#df_analysis = pd.read_parquet('df_analysis.parquet', engine='pyarrow')
#- separated df_analysis into positive and negative subsets
#df_positive = pd.read_parquet('df_positive.parquet', engine='pyarrow')
#df_negative = pd.read_parquet('df_negative.parquet', engine='pyarrow')
#lda_model_266 = LDA topic model with 37 topics fitted on the general corpus
#(37 because the topic count was probably reduced during tuning)
import torch

# Quick GPU sanity check: report CUDA availability, device count, and the CUDA
# version PyTorch was built against, then enumerate the visible GPUs.
# Bug fix: the loop body below had lost its indentation in the paste, which is a
# SyntaxError as written.
print("CUDA is available:", torch.cuda.is_available())
print("Number of CUDA devices:", torch.cuda.device_count())
print("CUDA version used by PyTorch:", torch.version.cuda)
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(i))  # name of each available GPU
CUDA is available: True Number of CUDA devices: 2 CUDA version used by PyTorch: 11.6 Tesla T4 Tesla T4
import numpy as np
import pandas as pd
# from gensim import corpora, models
# from gensim.models import Phrases
# from gensim.models import CoherenceModel
# from gensim.models.ldamodel import LdaModel
# from gensim.models.ldamulticore import LdaMulticore
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
from tqdm import tqdm
import re
import nltk
import os
import seaborn as sns
# Widen pandas display limits so long article text is readable in notebook output.
for _option, _value in (('display.max_rows', 100),
                        ('display.max_columns', None),
                        ('display.max_colwidth', 600)):
    pd.set_option(_option, _value)
#nltk.download('stopwords')
#!pip install gensim
#!pip install pyLDAvis
#!pip install nltk
#!pip install pyarrow
%%time
# Load the raw news dataset (~200k articles, 5 columns) straight from public GCS.
df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
df_news_final_project.shape
CPU times: user 6.78 s, sys: 3.92 s, total: 10.7 s Wall time: 14 s
(200332, 5)
# Save a local copy so the remote parquet does not have to be re-downloaded each session.
#df_news_final_project.to_parquet('news_final_project.parquet', engine='pyarrow')
#!pip install pandarallel
import pandarallel
from pandarallel import pandarallel
import multiprocessing
# Leave one core free for the main process; the rest drive parallel_apply workers.
num_processors = multiprocessing.cpu_count()
# use_memory_fs=False transfers data over pipes instead of /dev/shm (see INFO line below).
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)
INFO: Pandarallel will run on 31 workers. INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
# expand length of column width
pd.set_option('display.max_colwidth', 1000)
#df_news_final_project.head()
# value counts of language
# All 200,332 articles are tagged 'en', so no language filtering is needed.
df_news_final_project['language'].value_counts()
en 200332 Name: language, dtype: int64
#around 60k news articles with the same titles
#around 1500 news articles with the same text - removed this
#decided to only drop the same text because it may be the same articles
# Duplicate titles are kept deliberately: repeated titles will be analyzed later to surface trends.
df_news_final_project = df_news_final_project.drop_duplicates(subset=['text'])
df_news_final_project.shape
(198564, 5)
#take a sample of 1000 rows
#df_news_final_project_sample = df_news_final_project.sample(n=1000, random_state=1)
# Reset the index so row positions are contiguous after the duplicate drop.
df_news_final_project.reset_index(drop=True, inplace=True)
%%time
# Clean-up the noise, by eliminating newlines, tabs, remnants of web crawls, and other irrelevant text
import re
df_news_final_project['content_clean'] = df_news_final_project['text'].str.replace('\n', ' ')
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].str.replace('\t', ' ')
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].str.replace('\r', ' ')
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].str.replace('<!--.*?-->', ' ')
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].str.replace('<script.*?>.*?</script>', ' ')
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].str.replace('<.*?>', ' ')
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].str.replace('http\S+|www.\S+', ' ')
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].str.replace('\s+', ' ')
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].str.replace('^\s+|\s+$', ' ')
CPU times: user 2min 58s, sys: 4.42 s, total: 3min 3s Wall time: 3min 2s
%%time
def clean_text(text):
    """Drop crawl junk from *text*.

    Removes every sentence containing a token longer than 100 characters,
    then drops any remaining token longer than 20 characters, and returns
    the surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        tokens = sentence.split()
        # A >100-char "word" marks the whole sentence as crawl residue.
        if any(len(token) > 100 for token in tokens):
            continue
        kept_tokens.extend(token for token in tokens if len(token) <= 20)
    return ' '.join(kept_tokens)
# Strip crawl junk (over-long sentences/tokens) from the cleaned content column.
df_news_final_project['content_clean'] = df_news_final_project['content_clean'].apply(clean_text)
CPU times: user 1min 43s, sys: 32.6 ms, total: 1min 43s Wall time: 1min 43s
pd.set_option('display.max_colwidth', 1000)
# Seed vocabulary of AI-related phrases; articles must mention at least one to be kept.
ai_terms = ["Artificial Intelligence", "Machine Learning", "Deep Learning", "Neural Network", "Natural Language Processing", "Supervised Learning", "Unsupervised Learning", "Reinforcement Learning", "Generative Adversarial Networks", "Convolutional Neural Networks", "Recurrent Neural Networks", "Transfer Learning", "Data Mining", "Big Data", "Algorithm","large language models","llms","robotics","Chatbot", "Robotic Process Automation", "Computer Vision", "Image Recognition", "Speech Recognition", "Text Analytics", "Sentiment Analysis", "Autonomous Vehicles", "Internet of Things", "Edge Computing", "Quantum Computing"]
#filter dataframe
# Join the terms into one case-insensitive alternation for str.contains.
ai_terms = '|'.join(ai_terms)
df_news_final_project = df_news_final_project[df_news_final_project['content_clean'].str.contains(ai_terms, case=False, na=False)]
#df_news_final_project.head(1)
# Persist the filtered frame so later sessions can skip the download/cleaning steps.
df_news_final_project.to_parquet('content_clean_spacy_final_project.parquet', engine='pyarrow')
# Small reproducible-ish sample for the (slow) NER pass below.
df_analysis_sample = df_news_final_project.sample(n=1000) # Adjust the sample size as needed
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
%%time
# Named-entity extraction over the 1,000-article sample, grouped by year.
# NOTE(review): `nlp` is not defined anywhere in this notebook view — presumably
# a spaCy pipeline loaded in an earlier cell; confirm before re-running.
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON', 'NORP']
# Function to extract top entities by label and year
# Assumes `documents` is df_analysis_sample['content_clean'] so zipping with the
# sample's 'date' column keeps documents and years aligned — TODO confirm.
def extract_top_entities_by_label_and_year(documents, label):
entities_by_year = {}
for doc, year in zip(nlp.pipe(documents), df_analysis_sample['date'].dt.year):
if year not in entities_by_year:
entities_by_year[year] = []
entities_by_year[year].extend([ent.text for ent in doc.ents if ent.label_ == label])
return entities_by_year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
top_entities_by_label_and_year = extract_top_entities_by_label_and_year(df_analysis_sample['content_clean'], label)
# Pad every year's entity list to a common length so a rectangular DataFrame can be built.
max_length = max(len(entities) for entities in top_entities_by_label_and_year.values())
padded_entities_by_year = {year: entities + [''] * (max_length - len(entities))
for year, entities in top_entities_by_label_and_year.items()}
df_entities_by_label[label] = pd.DataFrame(padded_entities_by_year)
# Example: Accessing the top entities for 'ORG'
top_org_entities = df_entities_by_label['ORG']
# print(top_org_entities)
CPU times: user 8min 28s, sys: 12.1 s, total: 8min 40s Wall time: 8min 40s
%%time
# Clean-up the noise, by eliminating newlines, tabs, remnants of web crawls, and other irrelevant text
import re
df_news_final_project['cleaned_text'] = df_news_final_project['text'].str.replace('\n', ' ')
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].str.replace('\t', ' ')
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].str.replace('\r', ' ')
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].str.replace('<!--.*?-->', ' ')
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].str.replace('<script.*?>.*?</script>', ' ')
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].str.replace('<.*?>', ' ')
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].str.replace('http\S+|www.\S+', ' ')
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].str.replace('\s+', ' ')
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].str.replace('^\s+|\s+$', ' ')
import nltk
import string
# NLTK English stop words; requires nltk.download('stopwords') to have run.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Bug fix: the original membership test was case-sensitive, so capitalized stop
# words ("The", "A", ...) survived. Compare on the lowercased token instead;
# the kept tokens themselves are unchanged.
_remove_stopwords = lambda x: ' '.join(word for word in x.split() if word.lower() not in stopwords)
# Strip all ASCII punctuation in one translate pass.
_strip_punct = lambda x: x.translate(str.maketrans('', '', string.punctuation))
df_news_final_project['clean_title'] = df_news_final_project['title'].apply(_remove_stopwords).apply(_strip_punct)
df_news_final_project['cleaned_text'] = df_news_final_project['cleaned_text'].apply(_remove_stopwords).apply(_strip_punct)
import nltk
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the imported module to the word set.
stopwords = set(stopwords.words('english')) # define stopwords
def clean(string):
    """Normalize one document for downstream NLP.

    Strips URLs, hashtags, mentions, and everything after a copyright sign,
    then lower-cases the remaining purely-alphabetic tokens and removes stop
    words (module-level `stopwords`). Returns the tokens joined by spaces.
    """
    removal_patterns = (
        r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])',  # URLs
        r'[\s]*#[\w]+',   # hashtags (with any leading whitespace)
        r'[\s]*@[\w]+',   # mentions (with any leading whitespace)
        r"©.*",           # everything after a copyright sign
    )
    cleaned = string
    # Apply the removals sequentially, in the same order as before.
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    cleaned = cleaned.replace('\n', ' ')
    # Lower-case, keep alphabetic tokens only, and drop stop words.
    kept = [token.lower() for token in cleaned.split()
            if token.isalpha() and token.lower() not in stopwords]
    return " ".join(kept)
# Applying the function to the dataframe
df_news_final_project['cleaned_text'] = df_news_final_project['content_clean'].apply(clean)
df_news_final_project['clean_title'] = df_news_final_project['clean_title'].apply(clean)
# Same AI vocabulary as earlier, rebuilt here so this cell is self-contained.
ai_terms = ["Artificial Intelligence", "Machine Learning", "Deep Learning", "Neural Network", "Natural Language Processing", "Supervised Learning", "Unsupervised Learning", "Reinforcement Learning", "Generative Adversarial Networks", "Convolutional Neural Networks", "Recurrent Neural Networks", "Transfer Learning", "Data Mining", "Big Data", "Algorithm","large language models","llms","robotics","Chatbot", "Robotic Process Automation", "Computer Vision", "Image Recognition", "Speech Recognition", "Text Analytics", "Sentiment Analysis", "Autonomous Vehicles", "Internet of Things", "Edge Computing", "Quantum Computing"]
#filter dataframe
ai_terms = '|'.join(ai_terms)
ai_terms
'Artificial Intelligence|Machine Learning|Deep Learning|Neural Network|Natural Language Processing|Supervised Learning|Unsupervised Learning|Reinforcement Learning|Generative Adversarial Networks|Convolutional Neural Networks|Recurrent Neural Networks|Transfer Learning|Data Mining|Big Data|Algorithm|large language models|llms|robotics|Chatbot|Robotic Process Automation|Computer Vision|Image Recognition|Speech Recognition|Text Analytics|Sentiment Analysis|Autonomous Vehicles|Internet of Things|Edge Computing|Quantum Computing'
# Keep only articles whose cleaned text mentions at least one AI term (case-insensitive).
df_filtered = df_news_final_project[df_news_final_project['cleaned_text'].str.contains(ai_terms, case=False, na=False)]
df_filtered.shape
(164097, 8)
# #before cleaning - DONT RUN THIS
# import seaborn as sns
# df_news_final_project_sample = df_news_final_project_sample.dropna()
# df_news_final_project_sample = df_news_final_project_sample.drop_duplicates(subset=['cleaned_text'])
# df_news_final_project_sample['len_clean_text'] = df_news_final_project_sample['cleaned_text'].apply(lambda x: len(x))
# sns.displot(data=df_news_final_project_sample,x='len_clean_text')
<seaborn.axisgrid.FacetGrid at 0x1ccd33bb670>
#after cleaning
import seaborn as sns
# NOTE(review): this cell uses df_news_final_project_sample, which is only
# created in the commented-out sampling cell above — uncomment that first.
df_news_final_project_sample = df_news_final_project_sample.dropna()
df_news_final_project_sample = df_news_final_project_sample.drop_duplicates(subset=['cleaned_text'])
# Character length of each cleaned article, for the post-cleaning distribution plot.
df_news_final_project_sample['len_clean_text_afterclean'] = df_news_final_project_sample['cleaned_text'].apply(lambda x: len(x))
sns.displot(data=df_news_final_project_sample,x='len_clean_text_afterclean')
<seaborn.axisgrid.FacetGrid at 0x1cce0d41d80>
# Alias for the n-gram analysis below.
news_df = df_news_final_project
news_df.shape
(198564, 8)
tokenizer = nltk.tokenize.TweetTokenizer()
# Tokenize every cleaned article and flatten into one long token stream.
tokenized_sentences = news_df['cleaned_text'].apply(tokenizer.tokenize).tolist()
words = [word for sublist in tokenized_sentences for word in sublist]
bgs = nltk.bigrams(words)
# AI/industry vocabulary used to filter the bigrams. Stored as a set for O(1)
# membership tests instead of an O(n) list scan per bigram, and fully
# lowercased: the corpus was lowercased during cleaning, so the original
# mixed-case 'GenAI' entry could never match.
targeted_bgs = {'machine','learning','ai','artificial','intelligence','natural','language','processing','chatgpt','data','science','python','r','c','analytics','ml','nlp','generative','legal','office','law','llm','large','model','management','business','finance','financial','industry','healthcare','production','service','manufacture','productivity','job','genai'}
# Keep a bigram if either token is in the target vocabulary.
bgs = [b for b in bgs if (b[0] in targeted_bgs) or (b[1] in targeted_bgs)]
bigrams_freq = nltk.FreqDist(bgs)
bigrams_freq_df = pd.DataFrame(bigrams_freq.most_common(),columns=['Word', 'Frequency'])
bigrams_freq_df.head(n=50)
| Word | Frequency | |
|---|---|---|
| 0 | (artificial, intelligence) | 621509 |
| 1 | (machine, learning) | 239444 |
| 2 | (intelligence, ai) | 126521 |
| 3 | (data, science) | 76450 |
| 4 | (global, artificial) | 65781 |
| 5 | (generative, ai) | 44736 |
| 6 | (financial, services) | 43689 |
| 7 | (ai, powered) | 41162 |
| 8 | (ai, technology) | 39027 |
| 9 | (ai, market) | 35826 |
| 10 | (deep, learning) | 34066 |
| 11 | (data, analytics) | 32245 |
| 12 | (big, data) | 31612 |
| 13 | (intelligence, market) | 31439 |
| 14 | (use, ai) | 30618 |
| 15 | (edge, ai) | 29760 |
| 16 | (conversational, ai) | 28450 |
| 17 | (natural, language) | 27002 |
| 18 | (ai, platform) | 26505 |
| 19 | (using, ai) | 25772 |
| 20 | (ai, software) | 25229 |
| 21 | (new, ai) | 25147 |
| 22 | (global, ai) | 24950 |
| 23 | (ai, based) | 24580 |
| 24 | (ai, systems) | 24570 |
| 25 | (ai, machine) | 22841 |
| 26 | (business, technology) | 22785 |
| 27 | (heavy, industry) | 22198 |
| 28 | (ai, tools) | 21756 |
| 29 | (language, processing) | 19690 |
| 30 | (customer, service) | 19350 |
| 31 | (business, news) | 19172 |
| 32 | (ai, solutions) | 19141 |
| 33 | (small, business) | 18987 |
| 34 | (service, market) | 18930 |
| 35 | (ai, models) | 18850 |
| 36 | (industry, analysis) | 18464 |
| 37 | (ai, chatbot) | 18255 |
| 38 | (ai, ml) | 17819 |
| 39 | (industry, manufacturing) | 17761 |
| 40 | (general, business) | 17627 |
| 41 | (terms, service) | 17253 |
| 42 | (science, tech) | 17124 |
| 43 | (energy, natural) | 16637 |
| 44 | (data, scientists) | 16567 |
| 45 | (market, industry) | 16276 |
| 46 | (intelligence, software) | 15570 |
| 47 | (ai, driven) | 15262 |
| 48 | (science, technology) | 15032 |
| 49 | (healthcare, market) | 14841 |
# assuming 'words' is a list of tokenized words from your document
fourgrams = nltk.ngrams(words, 4)
# AI/industry vocabulary, as a set for O(1) membership tests and fully
# lowercased — the original mixed-case 'GenAI' entry could never match the
# lowercased corpus tokens.
targeted_fourgrams = {
    'machine', 'learning', 'ai', 'artificial', 'intelligence', 'natural',
    'language', 'processing', 'chatgpt', 'data', 'science', 'python', 'r',
    'c', 'analytics', 'ml', 'nlp', 'generative', 'legal', 'office', 'law',
    'llm', 'large', 'model', 'management', 'business', 'finance',
    'financial', 'industry', 'healthcare', 'production', 'service',
    'manufacture', 'productivity', 'job', 'genai',
}
# Keep a 4-gram if any of its four tokens is in the target vocabulary
# (replaces the original four-way `or` chain).
fourgrams = [gram for gram in fourgrams if any(tok in targeted_fourgrams for tok in gram)]
fourgrams_dist = nltk.FreqDist(fourgrams)
fourgrams_dist_df = pd.DataFrame(fourgrams_dist.most_common(), columns=['Word', 'Frequency'])
fourgrams_dist_df.head(n=50)
| Word | Frequency | |
|---|---|---|
| 0 | (global, artificial, intelligence, ai) | 102 |
| 1 | (artificial, intelligence, ai, service) | 86 |
| 2 | (call, center, ai, market) | 73 |
| 3 | (artificial, intelligence, energy, market) | 71 |
| 4 | (intelligence, energy, market, market) | 62 |
| 5 | (artificial, intelligencebased, security, market) | 56 |
| 6 | (ai, computer, vision, market) | 53 |
| 7 | (artificial, intelligence, software, market) | 53 |
| 8 | (cyber, data, science, market) | 52 |
| 9 | (automotive, artificial, intelligence, software) | 47 |
| 10 | (data, analytics, electronic, commerce) | 46 |
| 11 | (analytics, electronic, commerce, electronic) | 46 |
| 12 | (artificial, intelligence, oil, gas) | 45 |
| 13 | (artificial, intelligence, supply, chain) | 44 |
| 14 | (artificial, intelligence, manufacturing, market) | 44 |
| 15 | (artificial, intelligence, manufacturing, supply) | 43 |
| 16 | (intelligence, ai, service, market) | 43 |
| 17 | (artificial, intelligence, advisory, service) | 43 |
| 18 | (artificial, intelligence, ai, manufacturing) | 42 |
| 19 | (artificial, intelligence, robotics, market) | 41 |
| 20 | (intelligence, supply, chain, market) | 41 |
| 21 | (intelligence, manufacturing, supply, chain) | 41 |
| 22 | (artificial, intelligence, diabetes, management) | 41 |
| 23 | (privacy, policy, terms, service) | 40 |
| 24 | (artificial, intelligence, automotive, market) | 40 |
| 25 | (intelligence, ai, drug, discovery) | 39 |
| 26 | (artificial, intelligence, ai, drug) | 38 |
| 27 | (artificial, intelligence, ai, agriculture) | 38 |
| 28 | (ai, social, media, market) | 36 |
| 29 | (artificial, intelligence, aviation, market) | 36 |
| 30 | (artificial, intelligence, education, sector) | 36 |
| 31 | (automotive, artificial, intelligence, market) | 35 |
| 32 | (ai, networking, solution, market) | 35 |
| 33 | (nasdaq, artificial, intelligence, robotics) | 34 |
| 34 | (ai, drug, discovery, market) | 34 |
| 35 | (wearable, ai, device, market) | 34 |
| 36 | (artificial, intelligence, platforms, market) | 34 |
| 37 | (intelligence, ai, fintech, market) | 33 |
| 38 | (intelligence, ai, agriculture, market) | 33 |
| 39 | (artificial, intelligence, healthcare, market) | 32 |
| 40 | (artificial, intelligence, ai, fintech) | 32 |
| 41 | (first, trust, nasdaq, artificial) | 31 |
| 42 | (trust, nasdaq, artificial, intelligence) | 31 |
| 43 | (trending, topics, business, money) | 31 |
| 44 | (topics, business, money, auto) | 31 |
| 45 | (intelligence, advisory, service, market) | 31 |
| 46 | (artificial, intelligence, robotics, etf) | 30 |
| 47 | (artificial, intelligence, service, market) | 30 |
| 48 | (global, ai, networking, solution) | 30 |
| 49 | (financial, services, investing, general) | 29 |
#!pip install -q transformers
# created on GCP Vertex AI Notebook using `Python 3 (CUDA Toolkit 11.0)` environment
# using n1-standard-4 (4 vCPUS, 15 GB RAM) compute w/ 1 NVIDIA T4 GPU
# dependencies
#!pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
#!pip install transformers datasets evaluate rouge-score nltk py7zr
#nltk.download("punkt")
# note: installing an older version of pytorch so that cuda versions match
# note: py7zr is needed for the `samsum` dataset, may or may not be needed for other datasets
#!pip install tensorflow
from __future__ import absolute_import, division, print_function
import collections
import math
import glob
import json
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from transformers import WEIGHTS_NAME, BertConfig, BertForQuestionAnswering, BertTokenizer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import logging
import tensorflow as tf
# Silence transformers' informational logging; only errors will be printed.
logging.set_verbosity_error()
logging.get_verbosity()  # 40 corresponds to the ERROR level
# from utils import (get_answer, input_to_squad_example,
#                    squad_examples_to_features, to_list)
40
# df_analysis: the cleaned analysis frame saved earlier in the workflow —
# presumably derived from df_filtered; confirm against the saving cell.
df_analysis = pd.read_parquet('df_analysis.parquet', engine='pyarrow')
#adding this now to do sentiment over time - get sentiment scores
# Fixed random_state so the 1,000-row sentiment sample is reproducible.
df_filtered_sample = df_analysis.sample(n=1000, random_state=1)
df_filtered_sample.shape
(1000, 5)
from transformers import BertTokenizer, BertForSequenceClassification
import torch
# Load pretrained model and tokenizer
# FinancialBERT: 3-class financial-news sentiment. The label order assumed
# below is 0=negative, 1=neutral, 2=positive — confirm against the model card.
model = BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis", num_labels=3)
tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
# Define the maximum length for the text chunks. BERT typically works with a max_length of 512.
max_length = 512
# Classify one document with FinancialBERT and report the model's confidence.
def get_sentiment_and_score(text):
    """Return (sentiment_label, confidence) for *text*.

    Truncates the input to `max_length` tokens, runs the module-level
    FinancialBERT `model`, and maps the argmax logit to 'negative' /
    'neutral' / 'positive'. Confidence is the top softmax probability.
    """
    label_names = {0: 'negative', 1: 'neutral'}
    encoded = tokenizer(text, truncation=True, padding='longest', max_length=max_length, return_tensors='pt')
    logits = model(**encoded).logits
    # Softmax over the class dimension gives per-class probabilities.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_index = logits.argmax(dim=1).item()
    # Any index other than 0/1 maps to 'positive', mirroring the original
    # if/elif/else chain.
    sentiment = label_names.get(predicted_index, 'positive')
    confidence_score = probabilities.max(dim=-1).values.item()
    return sentiment, confidence_score
# Apply the function to the 'cleaned_text' column
df_filtered_sample['sentiment_finbert_title'], df_filtered_sample['confidence_score'] = zip(*df_filtered_sample['cleaned_text'].map(get_sentiment_and_score))
#value counts for sentiment column
# NOTE(review): this reads 'sentiment', but the column created above is
# 'sentiment_finbert_title' — 'sentiment' is only assigned in a later cell.
df_filtered_sample['sentiment'].value_counts()
sentiment neutral 741 positive 259 Name: count, dtype: int64
import torch
from transformers import pipeline
# Check if a GPU is available and if not, use a CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `device` is computed but never passed to the pipelines below,
# so they may still run on CPU — verify and pass device= if GPU use is intended.
device
device(type='cuda')
from transformers import pipeline
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
print(sentiment_analysis("I love this!"))
Downloading (…)lve/main/config.json: 0%| | 0.00/687 [00:00<?, ?B/s]
Downloading pytorch_model.bin: 0%| | 0.00/1.42G [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 0%| | 0.00/256 [00:00<?, ?B/s]
Downloading (…)olve/main/vocab.json: 0%| | 0.00/798k [00:00<?, ?B/s]
Downloading (…)olve/main/merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 0%| | 0.00/150 [00:00<?, ?B/s]
[{'label': 'POSITIVE', 'score': 0.9988656044006348}]
%%time
from transformers import pipeline
# Initialize the pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")
# Split your data into batches
batch_size = 100
batches = [df_filtered_sample['cleaned_text'][i:i + batch_size] for i in range(0,df_filtered_sample['cleaned_text'].shape[0],batch_size)]
# Apply the model to each batch and combine the results
sentiments = []
for batch in batches:
results = sentiment_analysis(list(batch), truncation=True, padding='longest', max_length=512)
batch_sentiments = [1 if result['label'] == 'positive' else 0 for result in results]
sentiments.extend(batch_sentiments)
# Assign the results to your dataframe
df_filtered_sample['sentiment'] = sentiments
CPU times: user 2h 42min 58s, sys: 2.37 s, total: 2h 43min Wall time: 10min 21s
df_filtered_sample['sentiment'].value_counts()
0 1000 Name: sentiment, dtype: int64
%%time
from transformers import pipeline
# Initialize the pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
# Split your data into batches
batch_size = 100
batches = [df_filtered_sample['cleaned_text'][i:i + batch_size] for i in range(0,df_filtered_sample['cleaned_text'].shape[0],batch_size)]
# Apply the model to each batch and combine the results
sentiments = []
for batch in batches:
results = sentiment_analysis(list(batch), truncation=True, padding='longest', max_length=512)
batch_sentiments = [result['label'] for result in results] # Directly use the model's output
sentiments.extend(batch_sentiments)
# Assign the results to your dataframe
df_filtered_sample['sentiment_roberta_base'] = sentiments
CPU times: user 51min 41s, sys: 875 ms, total: 51min 42s Wall time: 3min 15s
df_filtered_sample['sentiment_roberta_base'].value_counts()
#800 neutral for title
LABEL_1 987 LABEL_0 7 LABEL_2 6 Name: sentiment_roberta_base, dtype: int64
%%time
from transformers import pipeline
# Initialize the pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="shashanksrinath/News_Sentiment_Analysis")
# Split your data into batches
batch_size = 100
batches = [df_filtered_sample['clean_title'][i:i + batch_size] for i in range(0,df_filtered_sample['clean_title'].shape[0],batch_size)]
# Apply the model to each batch and combine the results
sentiments = []
for batch in batches:
results = sentiment_analysis(list(batch), truncation=True, padding='longest', max_length=512)
batch_sentiments = [result['label'] for result in results] # Directly use the model's output
sentiments.extend(batch_sentiments)
# Assign the results to your dataframe
df_filtered_sample['sentiment_srikanth_title'] = sentiments
Downloading (…)lve/main/config.json: 0%| | 0.00/964 [00:00<?, ?B/s]
Downloading pytorch_model.bin: 0%| | 0.00/499M [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 0%| | 0.00/1.54k [00:00<?, ?B/s]
Downloading (…)olve/main/vocab.json: 0%| | 0.00/798k [00:00<?, ?B/s]
Downloading (…)olve/main/merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
Downloading (…)/main/tokenizer.json: 0%| | 0.00/2.11M [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 0%| | 0.00/280 [00:00<?, ?B/s]
CPU times: user 5min 29s, sys: 1.04 s, total: 5min 30s Wall time: 30.6 s
df_filtered_sample['sentiment_srikanth_title'].value_counts()
# note: the full-text pass in the next cell comes back 1000/1000 neutral
Neutral 923 Positive 40 Negative 37 Name: sentiment_srikanth_title, dtype: int64
%%time
from transformers import pipeline
# Initialize the pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="shashanksrinath/News_Sentiment_Analysis")
# Split your data into batches
batch_size = 100
batches = [df_filtered_sample['cleaned_text'][i:i + batch_size] for i in range(0,df_filtered_sample['cleaned_text'].shape[0],batch_size)]
# Apply the model to each batch and combine the results
sentiments = []
for batch in batches:
results = sentiment_analysis(list(batch), truncation=True, padding='longest', max_length=512)
batch_sentiments = [result['label'] for result in results] # Directly use the model's output
sentiments.extend(batch_sentiments)
# Assign the results to your dataframe
df_filtered_sample['sentiment_srikanth'] = sentiments
CPU times: user 51min, sys: 1.08 s, total: 51min 1s Wall time: 3min 13s
df_filtered_sample['sentiment_srikanth'].value_counts()
# note: the title pass above returned 923/1000 neutral
Neutral 1000 Name: sentiment_srikanth, dtype: int64
%%time
from transformers import pipeline
# Initialize the pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest",tokenizer = "cardiffnlp/twitter-roberta-base-sentiment-latest")
# Split your data into batches
batch_size = 100
batches = [df_filtered_sample['cleaned_text'][i:i + batch_size] for i in range(0,df_filtered_sample['cleaned_text'].shape[0],batch_size)]
# Apply the model to each batch and combine the results
sentiments = []
for batch in batches:
results = sentiment_analysis(list(batch), truncation=True, padding='longest', max_length=512)
batch_sentiments = [result['label'] for result in results] # Directly use the model's output
sentiments.extend(batch_sentiments)
# Assign the results to your dataframe
df_filtered_sample['roberta_latest'] = sentiments
Downloading (…)lve/main/config.json: 0%| | 0.00/929 [00:00<?, ?B/s]
Downloading pytorch_model.bin: 0%| | 0.00/501M [00:00<?, ?B/s]
Downloading (…)olve/main/vocab.json: 0%| | 0.00/899k [00:00<?, ?B/s]
Downloading (…)olve/main/merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 0%| | 0.00/239 [00:00<?, ?B/s]
CPU times: user 50min 39s, sys: 1.76 s, total: 50min 41s Wall time: 3min 18s
df_filtered_sample['roberta_latest'].value_counts()
neutral 986 positive 10 negative 4 Name: roberta_latest, dtype: int64
%%time
from transformers import pipeline
# Initialize the pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest",tokenizer = "cardiffnlp/twitter-roberta-base-sentiment-latest")
# Split your data into batches
batch_size = 100
batches = [df_filtered_sample['clean_title'][i:i + batch_size] for i in range(0,df_filtered_sample['clean_title'].shape[0],batch_size)]
# Apply the model to each batch and combine the results
sentiments = []
for batch in batches:
results = sentiment_analysis(list(batch), truncation=True, padding='longest', max_length=512)
batch_sentiments = [result['label'] for result in results] # Directly use the model's output
sentiments.extend(batch_sentiments)
# Assign the results to your dataframe
# df_filtered_sample['roberta_latest_title'] = sentiments
CPU times: user 4min 18s, sys: 1.24 s, total: 4min 19s Wall time: 25.4 s
df_filtered_sample['roberta_latest_title'].value_counts()
neutral 826 positive 104 negative 70 Name: roberta_latest_title, dtype: int64
# Minimum probability each label must reach before it is assigned.
thresholds = {
    'LABEL_0': 0.05, # Negative threshold
    'LABEL_1': 0.9,  # Neutral threshold
    'LABEL_2': 0.05, # Positive threshold
}
def get_sentiment(text):
    """Return the highest-probability label whose probability clears its threshold.

    Encodes *text* (truncated to 512 tokens), runs the module-level `model`,
    and scans labels in descending-probability order. Returns 'unclassified'
    when no label's probability exceeds its configured threshold.
    """
    # Bug fix: `softmax` was used but never imported anywhere in this notebook.
    from scipy.special import softmax
    # Preprocess and encode the text
    encoded_input = tokenizer(text, truncation=True, max_length=512, return_tensors='pt')
    # Get the model's output
    output = model(**encoded_input)
    scores = output.logits.detach().numpy()
    # Convert logits to probabilities
    probabilities = softmax(scores, axis=-1)[0]
    label_probs = {f'LABEL_{i}': prob for i, prob in enumerate(probabilities)}
    # Bug fix: the original scanned labels in index order and returned the
    # FIRST one over its threshold — e.g. probabilities (0.06, 0.90, 0.04)
    # yielded LABEL_0 even though LABEL_1 dominates. Scan in descending
    # probability order so the most probable qualifying label wins.
    for label, prob in sorted(label_probs.items(), key=lambda kv: kv[1], reverse=True):
        if prob > thresholds[label]:
            return label
    return 'unclassified'  # no label cleared its threshold
%%time
# Apply the threshold-based classifier to the cleaned titles of the sample.
df_filtered_sample['roberta_latest_title'] = df_filtered_sample['clean_title'].apply(get_sentiment)
CPU times: user 5min 41s, sys: 200 ms, total: 5min 41s Wall time: 21.4 s
df_filtered_sample['roberta_latest_title'].value_counts()
LABEL_2 605 LABEL_0 290 LABEL_1 105 Name: roberta_latest_title, dtype: int64
# Spot-check examples for the negative / positive labels:
#df_filtered_sample[df_filtered_sample['roberta_latest_title'] == 'LABEL_0'].head()
#df_filtered_sample[df_filtered_sample['roberta_latest_title'] == 'LABEL_2'].head()
df_analysis = pd.read_parquet('df_analysis.parquet', engine='pyarrow')
df_analysis.shape
(147680, 5)
# NOTE(review): sampling n == len(df_analysis) just shuffles the frame;
# lower n here to actually subsample.
df_analysis_sample = df_analysis.sample(n=147680) # Adjust the sample size as needed
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Rebuild df_analysis from df_filtered with the bulky raw columns dropped,
# overwriting the frame read from parquet just above.
columns_to_drop = ['language', 'title', 'text', 'content_clean']
df_analysis = df_filtered.drop(columns=columns_to_drop)
CPU times: user 15h 42min 12s, sys: 33.8 s, total: 15h 42min 46s Wall time: 58min 55s
df_analysis['roberta_latest_title_sentiment'].value_counts()
#0 is negative
#2 is positive
#1 is neutral
LABEL_2 100996 LABEL_0 46684 LABEL_1 16417 Name: roberta_latest_title_sentiment, dtype: int64
df_analysis.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 164097 entries, 0 to 198562 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 url 164097 non-null object 1 date 164097 non-null object 2 clean_title 164097 non-null object 3 cleaned_text 164097 non-null object dtypes: object(4) memory usage: 6.3+ MB
df_filtered.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 164097 entries, 0 to 198562 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 url 164097 non-null object 1 date 164097 non-null object 2 language 164097 non-null object 3 title 164097 non-null object 4 text 164097 non-null object 5 content_clean 164097 non-null object 6 clean_title 164097 non-null object 7 cleaned_text 164097 non-null object dtypes: object(8) memory usage: 11.3+ MB
# bert topic on the sample above
#!pip install bertopic
from bertopic import BERTopic
pd.set_option('display.max_colwidth', 500)
# BERTopic expects a plain list of documents.
docs = df_analysis['cleaned_text'].tolist()
%%time
# min_topic_size=100 merges small clusters; calculate_probabilities=False keeps
# memory manageable at ~164k documents (see the probabilities=True variant below).
topic_model = BERTopic(language="english", min_topic_size=100, n_gram_range=(1,2), calculate_probabilities=False, verbose=True)
topics, probs = topic_model.fit_transform(docs)
Downloading (…)e9125/.gitattributes: 0%| | 0.00/1.18k [00:00<?, ?B/s]
Downloading (…)_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]
Downloading (…)7e55de9125/README.md: 0%| | 0.00/10.6k [00:00<?, ?B/s]
Downloading (…)55de9125/config.json: 0%| | 0.00/612 [00:00<?, ?B/s]
Downloading (…)ce_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]
Downloading (…)125/data_config.json: 0%| | 0.00/39.3k [00:00<?, ?B/s]
Downloading pytorch_model.bin: 0%| | 0.00/90.9M [00:00<?, ?B/s]
Downloading (…)nce_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]
Downloading (…)e9125/tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]
Downloading (…)9125/train_script.py: 0%| | 0.00/13.2k [00:00<?, ?B/s]
Downloading (…)7e55de9125/vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
Downloading (…)5de9125/modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]
Batches: 0%| | 0/5129 [00:00<?, ?it/s]
2023-05-19 08:32:27,334 - BERTopic - Transformed documents to Embeddings 2023-05-19 08:34:44,446 - BERTopic - Reduced dimensionality 2023-05-19 08:35:01,346 - BERTopic - Clustered reduced embeddings
CPU times: user 1h 11min 18s, sys: 13min 18s, total: 1h 24min 37s Wall time: 30min 57s
#this is calculate probabilities = True - only if you have less documents
# %%time
# # topic_model = BERTopic(language="english", min_topic_size=100, n_gram_range=(1,2), calculate_probabilities=True, verbose=True)
# topics, probs = topic_model.fit_transform(docs)
Batches: 0%| | 0/32 [00:00<?, ?it/s]
2023-05-18 20:46:57,224 - BERTopic - Transformed documents to Embeddings 2023-05-18 20:47:04,316 - BERTopic - Reduced dimensionality 2023-05-18 20:47:04,526 - BERTopic - Clustered reduced embeddings
CPU times: total: 9min 18s Wall time: 2min 52s
# Summary table of discovered topics; BERTopic reserves topic -1 for outliers.
freq = topic_model.get_topic_info()
print(f"Topics found: {freq.shape[0]}")
freq.head(20)
#266 topics found
Topics found: 266
| Topic | Count | Name | |
|---|---|---|---|
| 0 | -1 | 68052 | -1_ai_news_data_new |
| 1 | 0 | 7351 | 0_gray_gray media_media group_fund |
| 2 | 1 | 2940 | 1_market_analysis_global_growth |
| 3 | 2 | 2689 | 2_ment_cision_products_overview |
| 4 | 3 | 2272 | 3_chatgpt_openai_gpt_chatbot |
| 5 | 4 | 2236 | 4_market_artificial intelligence_artificial_intelligence |
| 6 | 5 | 2165 | 5_us_newswires_presswire_ein |
| 7 | 6 | 1729 | 6_npr_radio_schedule_donate |
| 8 | 7 | 1627 | 7_days_day_hours_may also |
| 9 | 8 | 1367 | 8_venturebeat_follow_follow us_vb |
| 10 | 9 | 1323 | 9_wfmz_wfmz tv_tv_lehigh |
| 11 | 10 | 1218 | 10_bing_microsoft_openai_chatgpt |
| 12 | 11 | 1061 | 11_ai_human_humans_intelligence |
| 13 | 12 | 1044 | 12_students_chatgpt_education_teachers |
| 14 | 13 | 1012 | 13_ct_chatgpt_students_school |
| 15 | 14 | 1007 | 14_paid program_paid_brandvoice_brandvoice paid |
| 16 | 15 | 1002 | 15_military_defense_force_dod |
| 17 | 16 | 976 | 16_machine learning_machine_learning_learning market |
| 18 | 17 | 922 | 17_covid_coronavirus_virus_health |
| 19 | 18 | 899 | 18_und_zu_die_hoc |
topic_model.get_topic(0) # Select the most frequent topic
[('gray', 0.007091426161136105),
('gray media', 0.0058789357950799976),
('media group', 0.005795778242709462),
('fund', 0.00578512053063246),
('prnewswire', 0.005234564722450711),
('group', 0.00502938227527594),
('ai', 0.004619169760825638),
('inc', 0.004490669368212461),
('group inc', 0.004454445690367658),
('press release', 0.0043550681738023555)]
topic_model.get_topic(1) # Select the most frequent topic
[('market', 0.018099644171155383),
('analysis', 0.010799928169534559),
('global', 0.009880534226911732),
('growth', 0.009762054240905335),
('report', 0.009636360758209145),
('players', 0.008618198408822181),
('forecast', 0.008037437129280343),
('key', 0.007947895814556622),
('trends', 0.006917526317881663),
('artificial intelligence', 0.0067421328819746934)]
topic_model.get_topic(4)
[('market', 0.020022401328726697),
('artificial intelligence', 0.014153443989317399),
('artificial', 0.014029064550587212),
('intelligence', 0.013742644967803043),
('report', 0.011640485276490361),
('analysis', 0.011443175454969482),
('growth', 0.010096051278654598),
('global', 0.009964343512412066),
('global artificial', 0.009088895392028373),
('forecast', 0.008530568818852252)]
#cant visualize topic probability distribution because probabilities not calculated for too many documents
# %%time
# #
# topic_model.visualize_distribution(probs[100], min_probability=0.001)
# Built-in BERTopic visualizations for the fitted full-corpus model.
topic_model.visualize_hierarchy(top_n_topics=50)
topic_model.visualize_barchart(top_n_topics=5)
topic_model.visualize_heatmap(n_clusters=1, width=1000, height=1000)
%%time
topic_model.visualize_topics()
#
CPU times: user 19min 15s, sys: 1min 7s, total: 20min 23s Wall time: 20min 22s
topic_model.visualize_term_rank()
%%time
# Merge similar topics down from the original 266.
# NOTE(review): nr_topics=90 is requested here, yet the log line below reports
# a reduction to 37 — the captured output appears to come from a different run
# of this cell; confirm which setting produced the saved model.
new_topics = topic_model.reduce_topics(docs, nr_topics=90)
#topic_mod.reduce_topics(docs, nr_topics = 3)
2023-05-19 14:35:36,495 - BERTopic - Reduced number of topics from 266 to 37
CPU times: user 8min 59s, sys: 6.26 s, total: 9min 5s Wall time: 9min 1s
freq = new_topics.get_topic_info()
print(f"Topics found: {freq.shape[0]}")
freq.head(20)
#xx topics found
Topics found: 37
Topics found: 37
| Topic | Count | Name | |
|---|---|---|---|
| 0 | -1 | 68052 | -1_ai_news_data_new |
| 1 | 0 | 30691 | 0_us_ai_news_new |
| 2 | 1 | 25225 | 1_market_ai_intelligence_data |
| 3 | 2 | 7438 | 2_ai_days_ago_new |
| 4 | 3 | 3130 | 3_venturebeat_follow_ai_follow us |
| 5 | 4 | 3108 | 4_ai_market_news_retail |
| 6 | 5 | 3060 | 5_starfilled_ai_data_starfilled starfilled |
| 7 | 6 | 2651 | 6_ai_news_ddn_arrow |
| 8 | 7 | 2158 | 7_chatgpt_students_ai_new |
| 9 | 8 | 1907 | 8_drug_market_discovery_edge |
| 10 | 9 | 1891 | 9_best_betting_casinos_casino |
| 11 | 10 | 1823 | 10_india_news_ai_vs |
| 12 | 11 | 1795 | 11_ai_intelligence_artificial_market |
| 13 | 12 | 1538 | 12_icon_musk_tesla_insider |
| 14 | 13 | 1357 | 13_market_security_security market_cyber |
| 15 | 14 | 1058 | 14_automotive_market_automotive artificial_artificial intelligence |
| 16 | 15 | 840 | 15_biometrics_climate_biometric_ai |
| 17 | 16 | 716 | 16_digi_digi communications_communications_communications reports |
| 18 | 17 | 625 | 17_credo_credo ai_ai_space |
| 19 | 18 | 563 | 18_renalytix_renalytix ai_average_moving average |
new_topics.get_topic(17)
[('credo', 0.03134197090312124),
('credo ai', 0.029289637894103982),
('ai', 0.021364997309663143),
('space', 0.01223210872399148),
('governance', 0.011082271732933127),
('responsible ai', 0.01032511197376296),
('gatc', 0.008903344752079982),
('responsible', 0.008721795149884727),
('health', 0.008276982884971964),
('nasa', 0.0077473647106864165)]
new_topics.get_topic(3)
[('venturebeat', 0.01597188872780595),
('follow', 0.014127602903932637),
('ai', 0.012981971075535325),
('follow us', 0.010469748687904267),
('data', 0.010306584274660111),
('us', 0.010173411915868066),
('military', 0.008634568083044592),
('intelligence', 0.007597319756642422),
('market', 0.0073108331960240955),
('share', 0.006798935451960033)]
new_topics.get_topic(14)
[('automotive', 0.030675455368951512),
('market', 0.02592497220755994),
('automotive artificial', 0.021189671643562797),
('artificial intelligence', 0.01878315727302124),
('artificial', 0.018721241311084953),
('intelligence', 0.018682329442605117),
('argo', 0.017459316139063513),
('autonomous', 0.012514170985267381),
('report', 0.011301534805286352),
('vehicle', 0.010649274380783315)]
new_topics.get_topic(12)
[('icon', 0.02278914010317371),
('musk', 0.02124157090996323),
('tesla', 0.0139380790613448),
('insider', 0.013328098772487775),
('ai', 0.012756757538290059),
('indicates', 0.011504085454988409),
('elon', 0.010440159853737469),
('elon musk', 0.00997111861503895),
('arize', 0.008510162294776118),
('news', 0.007805877380261475)]
new_topics.get_topic(10)
[('india', 0.018917263293555153),
('news', 0.010147797883290842),
('ai', 0.00807539017012378),
('vs', 0.007565545891341972),
('dh', 0.007293793904058606),
('live', 0.00723433961280067),
('world', 0.0064906908150264),
('day', 0.00614038911634154),
('delhi', 0.005938258039939079),
('watch', 0.005898813702523529)]
new_topics.get_topic(0)
[('us', 0.010734809484346389),
('ai', 0.010040996436923091),
('news', 0.008066518529668791),
('new', 0.006582928783344031),
('chatgpt', 0.004908856072202207),
('intelligence', 0.00490418209487045),
('data', 0.004751174246006199),
('market', 0.004619518292630521),
('artificial', 0.004521241407571895),
('technology', 0.004476860592960639)]
new_topics.visualize_hierarchy(top_n_topics=50)
# Directory where fitted models are persisted.
path_lda = '/home/jupyter'
# Save model - 37 topics
# NOTE(review): these are BERTopic models despite the 'lda_model' file names —
# the names look left over from an earlier LDA experiment; consider renaming.
new_topics.save(path_lda +'/lda_model')
# Load model
#saved_model = BERTopic.load(path_lda +'/lda_model')
# Save model - 266 topics
topic_model.save(path_lda +'/lda_model_266')
# Load model
#saved_model = BERTopic.load(path_lda +'/lda_model')
#df_analysis = pd.read_parquet('df_analysis.parquet', engine='pyarrow')
#- separated from df_analysis into positive and negative
#df_positive = pd.read_parquet('df_positive.parquet', engine='pyarrow')
#df_negative = pd.read_parquet('df_negative.parquet', engine='pyarrow')
#lda_model_266 = topics on 37 topics of general corpus (it is 37 because i probably reduced it)
#lda_model_266 = topics on 37 topics of general corpus
# spaCy setup for named-entity extraction.
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding
from tqdm import tqdm
# Use the GPU if one is available; silently fall back to CPU otherwise
# (require_gpu, commented out, would raise instead).
spacy.prefer_gpu()
#spacy.require_gpu()
print(spacy.__version__)
2023-05-25 18:41:34.839269: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-05-25 18:41:36.360839: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 2023-05-25 18:41:36.364336: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 2023-05-25 18:41:36.371640: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 2023-05-25 18:41:36.374492: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 2023-05-25 18:41:36.377220: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. 
See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 2023-05-25 18:41:36.379822: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
3.5.3
# Positive-sentiment subset saved earlier in the notebook.
df_positive = pd.read_parquet('df_positive.parquet', engine='pyarrow')
# Article text that keeps original casing, keyed by url — presumably kept
# un-lowercased because NER relies on capitalization (see comment below); confirm.
spacy_content_df = pd.read_parquet('content_clean_spacy_final_project.parquet', engine='pyarrow')
spacy_content_df.shape
(162486, 6)
# nlp = spacy.load("en_core_web_sm",exclude=['tok2vec','lemmatizer'])
# Large English pipeline; tok2vec/lemmatizer excluded since only NER is needed here.
nlp = spacy.load("en_core_web_lg",exclude=['tok2vec','lemmatizer'])
#add spacy text (not lower cased and unclean) to the negative topics dataframe
df_positive = pd.merge(df_positive, spacy_content_df[["content_clean", "url"]], left_on="url", right_on="url", how="inner")
#df_topics_negative.drop("cleaned_text", axis=1, inplace=True)
df_positive['date'] = pd.to_datetime(df_positive['date'])
df_positive.shape
(100039, 6)
# Alias so the generic NER cells below can run unchanged on the positive subset.
df_analysis_sample = df_positive
#df_analysis_sample = df_analysis_sample.sample(n=100039)
%%time
# Define the entity labels of interest
# (spaCy NER labels: organizations, products, people)
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document
def extract_entities(row, label):
    """Return the text of every named entity in row['content_clean'] whose NER label equals *label*."""
    parsed = nlp(row['content_clean'])
    return [span.text for span in parsed.ents if span.label_ == label]
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
# NOTE(review): every document is re-parsed by spaCy once per label (3x total)
# because extract_entities calls nlp() inside the per-label apply — parsing
# each document once and splitting doc.ents by label would cut the ~1.5h
# runtime roughly threefold.
for label in entity_labels:
df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
# Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
# (sum concatenates the per-row entity lists within each year; value_counts
# then tallies each entity string per year)
entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
# Sort the entities by their counts for each year and keep top N entities
N = 30 # you can change this to keep as many top entities as you like
top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
df_entities_by_label[label] = top_entities_by_year
CPU times: user 23min 25s, sys: 20.2 s, total: 23min 45s Wall time: 1h 27min 7s
# Example: Accessing the top entities for 'ORG'
top_org_entities_pos = df_entities_by_label['ORG']
top_person_entities_pos = df_entities_by_label['PERSON']
top_product_entities_pos = df_entities_by_label['PRODUCT']
top_org_entities_pos.head(5)
| 2020 | 2021 | 2022 | 2023 | |
|---|---|---|---|---|
| AI | 21384.0 | 31597.0 | 44393.0 | 40222.0 |
| Artificial Intelligence | 14745.0 | 14808.0 | 6118.0 | 3192.0 |
| 11417.0 | 11160.0 | 9813.0 | 29527.0 | |
| Microsoft | 10188.0 | 10408.0 | 5031.0 | 31330.0 |
| IBM | 9972.0 | 10224.0 | 6448.0 | 1312.0 |
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# --- Top organizations per year (positive-sentiment corpus) ---

# Reset the index of the DataFrame so the entity names become a column
top_org_entities_reset = top_org_entities_pos.reset_index()

# Rename the columns for clarification
top_org_entities_reset = top_org_entities_reset.rename(columns={'index': 'Entity'})

# Melt the DataFrame to long form (one row per entity/year) for plotting
df_melted = top_org_entities_reset.melt(id_vars='Entity', var_name='Year', value_name='Count')

# Ensure 'Year' is integer for plotting
df_melted['Year'] = df_melted['Year'].astype(int)

# Select the top 10 entities for each year based on count
top_10_each_year = df_melted.groupby('Year').apply(lambda x: x.nlargest(10, 'Count')).reset_index(drop=True)

# Define a custom color palette for better differentiation of lines
custom_palette = sns.color_palette("colorblind", n_colors=len(top_10_each_year['Entity'].unique()))

# Create line plot with seaborn and custom palette
plt.figure(figsize=(15, 10))
sns.lineplot(data=top_10_each_year, x='Year', y='Count', hue='Entity', palette=custom_palette, linewidth=4.5)

# Set plot title and labels
plt.title('Top 10 Orgs by Year', fontsize=18)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Set y-axis limit
plt.ylim(0, 100000)

# Customize ticks and tick labels
plt.xticks([2020, 2021, 2022, 2023], fontsize=12)
plt.yticks(fontsize=12)

# Keep the right and top spines visible (the original comment said "remove",
# but the code deliberately sets them visible)
plt.gca().spines['right'].set_visible(True)
plt.gca().spines['top'].set_visible(True)

# Legend outside the axes, titled for the entity type actually plotted
# (was 'Persons' in the original — a copy/paste mislabel on an orgs chart).
# The original called plt.legend twice; only the last call takes effect, so
# the duplicate is dropped.
plt.legend(title='Organizations', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=12)

# Set the style and context using Seaborn
# NOTE(review): set_style/set_context only affect figures created afterwards,
# so they do nothing for the figure above — move before plt.figure() if the
# darkgrid style is wanted here.
sns.set_style("darkgrid")
sns.set_context("notebook")

# Set background color
plt.gca().set_facecolor('#000000')

plt.tight_layout()
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# --- Top persons per year (positive-sentiment corpus) ---

# Reset the index of the DataFrame so the entity names become a column
top_person_entities_reset = top_person_entities_pos.reset_index()

# Rename the columns for clarification
top_person_entities_reset = top_person_entities_reset.rename(columns={'index': 'Entity'})

# Melt the DataFrame to long form (one row per entity/year) for plotting
df_melted = top_person_entities_reset.melt(id_vars='Entity', var_name='Year', value_name='Count')

# Ensure 'Year' is integer for plotting
df_melted['Year'] = df_melted['Year'].astype(int)

# Select the top 7 entities for each year based on count
# (original comment and title said "top 10" while the code took nlargest(7))
top_7_each_year = df_melted.groupby('Year').apply(lambda x: x.nlargest(7, 'Count')).reset_index(drop=True)

# Define a custom color palette for better differentiation of lines
custom_palette = sns.color_palette("colorblind", n_colors=len(top_7_each_year['Entity'].unique()))

# Create line plot with seaborn and custom palette
plt.figure(figsize=(15, 10))
sns.lineplot(data=top_7_each_year, x='Year', y='Count', hue='Entity', palette=custom_palette, linewidth=4.5)

# Set plot title and labels (titled for the 7 series actually plotted)
plt.title('Top 7 Persons by Year', fontsize=18)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Set y-axis limit
plt.ylim(0, 6000)

# Customize ticks and tick labels
plt.xticks([2020, 2021, 2022, 2023], fontsize=12)
plt.yticks(fontsize=12)

# Keep the right and top spines visible (the original comment said "remove",
# but the code deliberately sets them visible)
plt.gca().spines['right'].set_visible(True)
plt.gca().spines['top'].set_visible(True)

# Legend outside the axes (the original called plt.legend twice; only the
# last call takes effect, so the duplicate is dropped)
plt.legend(title='Persons', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=12)

# Set the style and context using Seaborn
# NOTE(review): set_style/set_context only affect figures created afterwards —
# move above plt.figure() if darkgrid is wanted for this chart.
sns.set_style("darkgrid")
sns.set_context("notebook")

# Set background color
plt.gca().set_facecolor('#000000')

plt.tight_layout()
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# --- Top products per year (positive-sentiment corpus) ---

# Reset the index of the DataFrame so the entity names become a column
top_product_entities_reset = top_product_entities_pos.reset_index()

# Rename the columns for clarification
top_product_entities_reset = top_product_entities_reset.rename(columns={'index': 'Entity'})

# Melt the DataFrame to long form (one row per entity/year) for plotting
df_melted = top_product_entities_reset.melt(id_vars='Entity', var_name='Year', value_name='Count')

# Ensure 'Year' is integer for plotting
df_melted['Year'] = df_melted['Year'].astype(int)

# Select the top 7 entities for each year based on count
# (original comment and title said "top 10" while the code took nlargest(7))
top_7_each_year = df_melted.groupby('Year').apply(lambda x: x.nlargest(7, 'Count')).reset_index(drop=True)

# Define a custom color palette for better differentiation of lines
custom_palette = sns.color_palette("colorblind", n_colors=len(top_7_each_year['Entity'].unique()))

# Create line plot with seaborn and custom palette
plt.figure(figsize=(15, 10))
sns.lineplot(data=top_7_each_year, x='Year', y='Count', hue='Entity', palette=custom_palette, linewidth=4.5)

# Set plot title and labels (titled for the 7 series actually plotted)
plt.title('Top 7 Products by Year', fontsize=18)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Set y-axis limit
plt.ylim(0, 6000)

# Customize ticks and tick labels
plt.xticks([2020, 2021, 2022, 2023], fontsize=12)
plt.yticks(fontsize=12)

# Keep the right and top spines visible (the original comment said "remove",
# but the code deliberately sets them visible)
plt.gca().spines['right'].set_visible(True)
plt.gca().spines['top'].set_visible(True)

# Legend outside the axes, titled for the entity type actually plotted
# (was 'Persons' in the original — a copy/paste mislabel on a products chart).
# The original called plt.legend twice; only the last call takes effect, so
# the duplicate is dropped.
plt.legend(title='Products', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=12)

# Set the style and context using Seaborn
# NOTE(review): set_style/set_context only affect figures created afterwards —
# move above plt.figure() if darkgrid is wanted for this chart.
sns.set_style("darkgrid")
sns.set_context("notebook")

# Set background color
plt.gca().set_facecolor('#000000')

plt.tight_layout()
plt.show()
# top_org_entities_pos.columns = ['2020', '2021', '2022', '2023']
# top_person_entities_pos.columns = ['2020', '2021', '2022', '2023']
# top_product_entities_pos.columns = ['2020', '2021', '2022', '2023']
# top_org_entities_pos.to_parquet('top_org_entities_pos.parquet', engine='pyarrow')
# top_person_entities_pos.to_parquet('top_person_entities_pos.parquet', engine='pyarrow')
# top_product_entities_pos.to_parquet('top_product_entities_pos.parquet', engine='pyarrow')
#removing neutrals
# Drop neutral titles (LABEL_1), keeping only positive/negative documents.
df_analysis = df_analysis[df_analysis['roberta_latest_title_sentiment'] != 'LABEL_1']
# Persist the filtered frame; this overwrites the earlier df_analysis.parquet.
df_analysis.to_parquet('df_analysis.parquet', engine='pyarrow')
# Positive subset: LABEL_2 = positive (per the label-mapping comments earlier).
df_positive = df_analysis[df_analysis['roberta_latest_title_sentiment'] == 'LABEL_2']
#df_positive.to_parquet('df_positive.parquet', engine='pyarrow')
df_positive.shape
(100996, 5)
%%time
# Corpus for the positive-only topic model.
docs = df_positive['cleaned_text'].tolist()
CPU times: user 105 ms, sys: 369 ms, total: 475 ms Wall time: 469 ms
%%time
# Fit BERTopic on the positive-sentiment documents only; smaller
# min_topic_size than the full-corpus model, so topics are finer-grained.
topic_model_positive = BERTopic(language="english", min_topic_size=30, n_gram_range=(1,2), calculate_probabilities=False, verbose=True)
topics, probs = topic_model_positive.fit_transform(docs)
Batches: 0%| | 0/3157 [00:00<?, ?it/s]
2023-05-19 15:37:45,889 - BERTopic - Transformed documents to Embeddings 2023-05-19 15:38:59,421 - BERTopic - Reduced dimensionality 2023-05-19 15:39:07,756 - BERTopic - Clustered reduced embeddings
CPU times: user 46min 52s, sys: 5min 55s, total: 52min 47s Wall time: 20min 39s
%%time
#running again because after reduction to 50 topics even after changing variable name you cannot access previous variable
# (i.e. reduce_topics modifies the model in place per the author's note above,
# so the model must be refit to recover the un-reduced topics)
topic_model_positive = BERTopic(language="english", min_topic_size=30, n_gram_range=(1,2), calculate_probabilities=False, verbose=True)
topics, probs = topic_model_positive.fit_transform(docs)
Batches: 0%| | 0/3127 [00:00<?, ?it/s]
2023-05-26 06:31:38,542 - BERTopic - Transformed documents to Embeddings 2023-05-26 06:32:47,132 - BERTopic - Reduced dimensionality 2023-05-26 06:32:56,026 - BERTopic - Clustered reduced embeddings
CPU times: user 44min 13s, sys: 6min 37s, total: 50min 50s Wall time: 21min 39s
#checking with below because it was rerun
# Topic inventory for the refit positive model; -1 is the outlier bucket
# (filtered out further down).
freq = topic_model_positive.get_topic_info()
print(f"Topics found: {freq.shape[0]}")
freq.head(20)
#xx topics found
Topics found: 754
| Topic | Count | Name | |
|---|---|---|---|
| 0 | -1 | 29299 | -1_us_ai_news_new |
| 1 | 0 | 2280 | 0_market_analysis_players_growth |
| 2 | 1 | 1975 | 1_ment_cision_entertain ment_entertain |
| 3 | 2 | 1044 | 2_laivly_gray_gray media_media group |
| 4 | 3 | 907 | 3_market_intelligence software_artificial intelligence_artificial |
| 5 | 4 | 810 | 4_learning market_machine learning_machine_learning |
| 6 | 5 | 803 | 5_wfmz_wfmz tv_lehigh_berks |
| 7 | 6 | 778 | 6_und_zu_die_hoc |
| 8 | 7 | 758 | 7_chatgpt_gpt_openai_chatbot |
| 9 | 8 | 713 | 8_days_software days_technology days_may also |
| 10 | 9 | 546 | 9_nvidia_gpus_gpu_dgx |
| 11 | 10 | 543 | 10_venturebeat_follow us_follow_us rss |
| 12 | 11 | 529 | 11_paid program_brandvoice_brandvoice paid_paid |
| 13 | 12 | 497 | 12_intelligence healthcare_healthcare market_healthcare_intelligence medicine |
| 14 | 13 | 458 | 13_computer vision_vision market_vision_computer |
| 15 | 14 | 452 | 14_ment_cision_entertain ment_entertain |
| 16 | 15 | 409 | 15_customer_conversational_crm_conversational ai |
| 17 | 16 | 394 | 16_npr_dall_radio_donate |
| 18 | 17 | 378 | 17_india_vs_viral_latestly |
| 19 | 18 | 372 | 18_india_minister_indian_modi |
topic_model_positive.get_topic(0) # Select the most frequent topic
[('market', 0.006351688447625568),
('analysis', 0.004923013611003048),
('players', 0.0047061695399391165),
('growth', 0.004649534207824899),
('report', 0.004449352784590002),
('global', 0.004313027856453599),
('forecast', 0.004172499522282458),
('key', 0.004056633436015475),
('corporation', 0.003824824104588242),
('trends', 0.0037736808075665936)]
topic_model_positive.visualize_hierarchy(top_n_topics=50)
topic_model_positive.visualize_barchart(top_n_topics=5)
# Merge the 754 positive-corpus topics down to ~310.
new_topics_positive = topic_model_positive.reduce_topics(docs, nr_topics=310)
#topic_mod.reduce_topics(docs, nr_topics = 3)
2023-05-19 17:31:42,009 - BERTopic - Reduced number of topics from 755 to 310
freq = new_topics_positive.get_topic_info()
print(f"Topics found: {freq.shape[0]}")
#freq.head(200)
#xx topics found
# Set the display options to show more rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Print the dataframe
# freq.head(200)
Topics found: 310
# Save model - 310 topics_positive
new_topics_positive.save(path_lda +'/lda_model_positive_310')
# Load model
#saved_model = BERTopic.load(path_lda +'/lda_model')
#leverage this below is key for this topic search
#topic_model.get_document_info(docs)
# Per-document topic assignments (includes 'Topic' and
# 'Representative_document' columns used below).
df_topics_positive = new_topics_positive.get_document_info(docs)
#removing outliers
df_topics_positive = df_topics_positive[df_topics_positive['Topic'] != -1]
df_topics_positive["Representative_document"].value_counts()
#after removing -1 (outliers)
False 100045 True 951 Name: Representative_document, dtype: int64
#df_topics_positive.to_parquet('df_topics_positive.parquet', engine='pyarrow')
# Reload the saved artifacts so the analysis below can run without refitting.
new_topics_positive = BERTopic.load(path_lda +'/lda_model_positive_310')
path_lda
'/home/jupyter'
df_topics_positive = pd.read_parquet('df_topics_positive.parquet', engine='pyarrow')
topic_counts = df_topics_positive['Topic'].value_counts()
print(topic_counts)
0 3799
1 3414
2 2833
3 1700
4 1688
...
303 31
306 31
305 31
307 30
308 30
Name: Topic, Length: 309, dtype: int64
# Find topics semantically similar to ChatGPT-related query strings.
similar_topics, similarity = new_topics_positive.find_topics("chat gpt", top_n=10); similar_topics
[6, 237, 184, 29, 58, 198, 59, 122, 2, 91]
similar_topics, similarity = new_topics_positive.find_topics("chatgpt", top_n=10); similar_topics
[6, 237, 29, 184, 26, 302, 58, 122, 62, 91]
similar_topics, similarity = new_topics_positive.find_topics("gpt", top_n=10); similar_topics
[6, 237, 198, 58, 59, 122, 12, 2, 90, 141]
#chatgpt
# Topic ids copied from the "chat gpt" query above; documents in any of these
# topics are treated as ChatGPT-related for the sentiment analysis below.
selected_topics = [6, 237, 184, 29, 58, 198, 59, 122, 2, 91]
chatgpt_df = df_topics_positive[df_topics_positive['Topic'].isin(selected_topics)]
# chatgpt_df.head(1)
# chatgpt_df.shape
from collections import Counter

# Count positive/negative sentences per ChatGPT-related document.
# NOTE(review): the 'Positive_Sentences' / 'Negative_Sentences' columns are
# produced by a cell not shown here; each is assumed to hold a list of
# sentences — confirm. len() applied per cell replaces the original
# iterrows() loop that built parallel Python lists: same counts, one
# vectorized pass, and no chained-assignment ambiguity from appending
# row-by-row to a filtered frame.
chatgpt_df['Positive_Sentence_Count'] = chatgpt_df['Positive_Sentences'].apply(len)
chatgpt_df['Negative_Sentence_Count'] = chatgpt_df['Negative_Sentences'].apply(len)
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer (NLTK's VADER implementation)
sia = SentimentIntensityAnalyzer()

# Function to perform aspect-based sentiment analysis
def perform_aspect_sentiment_analysis(text):
    """Return the mean VADER compound score over the sentences of *text*.

    Returns 0.0 for text that tokenizes to zero sentences — the original
    version raised ZeroDivisionError on such input.
    """
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)
    # Analyze sentiment for each sentence
    sentiment_scores = [sia.polarity_scores(s)['compound'] for s in sentences]
    # Return the average sentiment score for the text (0.0 if no sentences)
    return sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else 0.0

# Apply aspect-based sentiment analysis to the 'text' column of your DataFrame
# NOTE(review): 'chatgpt_df_1' is not defined in any visible cell (other cells
# use 'chatgpt_df') — likely a stale variable name; confirm before rerunning.
chatgpt_df_1['aspect_sentiment'] = chatgpt_df_1['Document'].apply(perform_aspect_sentiment_analysis)
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to perform targeted sentiment analysis
def perform_targeted_sentiment_analysis(row, target_entity):
    """Classify the sentiment of row['Document'] with respect to *target_entity*.

    Returns 'positive' / 'negative' / 'neutral' (VADER compound thresholds
    of +/-0.05) when the entity string occurs in the text (case-insensitive
    substring match), otherwise 'not mentioned'.
    """
    text = row['Document']
    # Cheap containment test first: skip VADER scoring entirely for documents
    # that never mention the entity (the original scored every row up front,
    # wasting the expensive polarity computation on 'not mentioned' rows).
    if target_entity.lower() not in text.lower():
        return 'not mentioned'
    sentiment_scores = analyzer.polarity_scores(text)
    compound_score = sentiment_scores['compound']
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Example target entity
target_entity = "innovation"

# Perform targeted sentiment analysis on the chatgpt_df dataframe
chatgpt_df['Sentiment_innovation'] = chatgpt_df.apply(perform_targeted_sentiment_analysis, target_entity=target_entity, axis=1)
# Summarize the targeted-sentiment labels into a human-readable report.
sentiment_counts = chatgpt_df['Sentiment_innovation'].value_counts()
positive_count = sentiment_counts.get('positive', 0)
negative_count = sentiment_counts.get('negative', 0)
neutral_count = sentiment_counts.get('neutral', 0)

# Convert raw counts into percentages of all analyzed documents.
total_count = len(chatgpt_df)
positive_percentage = (positive_count / total_count) * 100
negative_percentage = (negative_count / total_count) * 100
neutral_percentage = (neutral_count / total_count) * 100

# Assemble the report line by line (same text as concatenating with +=).
interpretation = "\n".join([
    f"In the {total_count} analyzed documents, the sentiment towards {target_entity} is as follows:",
    f"Positive: {positive_count} documents ({positive_percentage:.2f}%)",
    f"Negative: {negative_count} documents ({negative_percentage:.2f}%)",
    f"Neutral: {neutral_count} documents ({neutral_percentage:.2f}%)",
])

# Print the interpretation
print(interpretation)
In the 5882 analyzed documents, the sentiment towards innovation is as follows: Positive: 1681 documents (28.58%) Negative: 11 documents (0.19%) Neutral: 0 documents (0.00%)
# Second reduction pass: collapse the 754 positive topics to 100.
# NOTE(review): the variable is named *_50 but nr_topics=100 — the suffix is
# stale from an earlier attempt; rename when convenient.
new_topics_positive_50 = topic_model_positive.reduce_topics(docs, nr_topics=100)
#topic_mod.reduce_topics(docs, nr_topics = 3)
2023-05-26 06:46:50,166 - BERTopic - Reduced number of topics from 754 to 100
# Per-document assignments for the 100-topic model, persisted for reuse.
df_topics_positives_100 = new_topics_positive_50.get_document_info(docs)
df_topics_positives_100.to_parquet('df_topics_positives_100.parquet', engine='pyarrow')
freq = new_topics_positive_50.get_topic_info()
print(f"Topics found: {freq.shape[0]}")
#freq.head(200)
#xx topics found
# Set the display options to show more rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Print the dataframe
freq.head(100)
#topic 2 - chatgpt
#topic 4 - healthcare (topic 19 - drug discovery)(topic 24 - covid coronavirus)
#topic 5 - computer vision
#topic 7 - automotive / tesla
#topic 8 - nvidia
#topic 9 - military intelligence aviation defense
#topic 11 - intelligence education
#topic 15 - cybersecurity
#topic 21 - retail fashion
#topic 22 - iot
#topic 28 - agriculture
#topic 31 - energy intelligence
#topic 32 - food intelligence ai
#topic 33 - legal tech
#topic 34 - dentists
#topic 38 - conversational ai
#topic 41 - quantum computing
#topic 58 - insurance
#topic 71 - oil
Topics found: 100
| | Topic | Count | Name |
|---|---|---|---|
| 0 | -1 | 29299 | -1_ai_us_news_market |
| 1 | 0 | 21208 | 0_market_ai_data_services |
| 2 | 1 | 7820 | 1_us_news_tv_ai |
| 3 | 2 | 5350 | 2_chatgpt_microsoft_google_news |
| 4 | 3 | 4961 | 3_paid_paid program_fund_brandvoice |
| 5 | 4 | 3366 | 4_healthcare_medical_health_market |
| 6 | 5 | 2189 | 5_vision_computer vision_computer_market |
| 7 | 6 | 1836 | 6_market_intelligence_artificial intelligence_artificial |
| 8 | 7 | 1532 | 7_automotive_musk_tesla_automotive artificial |
| 9 | 8 | 1297 | 8_nvidia_gpu_ai_hpc |
| 10 | 9 | 1160 | 9_military_intelligence aviation_defense_aviation |
| 11 | 10 | 1079 | 10_baidu_china_ai_chinese |
| 12 | 11 | 985 | 11_education_intelligence education_market_ai education |
| 13 | 12 | 980 | 12_days_eurekalert_science_new |
| 14 | 13 | 913 | 13_yext_gartner_ai_arize |
| 15 | 14 | 807 | 14_venturebeat_follow_follow us_data |
| 16 | 15 | 754 | 15_security_security market_market_cyber security |
| 17 | 16 | 657 | 16_aacc_protein_clinical_proteins |
| 18 | 17 | 588 | 17_pm_est_edt_pm est |
| 19 | 18 | 586 | 18_space_earth_new_science |
| 20 | 19 | 586 | 19_drug_discovery_drug discovery_discovery market |
| 21 | 20 | 515 | 20_lunit_gurufocus_guru_buffett |
| 22 | 21 | 495 | 21_retail_retail market_intelligence retail_fashion |
| 23 | 22 | 425 | 22_iot_iot market_intelligence iot_smartcow |
| 24 | 23 | 415 | 23_ip_soc_ieee_ip soc |
| 25 | 24 | 386 | 24_covid_health_coronavirus_pandemic |
| 26 | 25 | 375 | 25_digi_digi communications_communications_communications reports |
| 27 | 26 | 370 | 26_seo_naver_search_product |
| 28 | 27 | 361 | 27_music_music rights_pro music_song |
| 29 | 28 | 360 | 28_agriculture_agriculture market_ai agriculture_market |
| 30 | 29 | 343 | 29_market_chips_chips market_chip |
| 31 | 30 | 340 | 30_oracle_arcspan_tifin_sambanova |
| 32 | 31 | 334 | 31_energy_intelligence energy_ecolibrium_energy market |
| 33 | 32 | 316 | 32_food_ai food_intelligence food_food beverage |
| 34 | 33 | 314 | 33_legaltech_legaltech artificial_legal_law |
| 35 | 34 | 278 | 34_dental_overjet_gatc_dentists |
| 36 | 35 | 260 | 35_humane_species_fish_salmon |
| 37 | 36 | 233 | 36_miko_meta_kids_parents |
| 38 | 37 | 214 | 37_ddn_storage_data_data management |
| 39 | 38 | 213 | 38_conversational_conversational ai_market_ai market |
| 40 | 39 | 208 | 39_ethics_ethical_ai ethics_ai |
| 41 | 40 | 182 | 40_bs_business standard_standard_india |
| 42 | 41 | 166 | 41_quantum_quantum computing_computing_computing technologies |
| 43 | 42 | 165 | 42_epic_superb_superb ai_infinity |
| 44 | 43 | 164 | 43_average_moving average_moving_index |
| 45 | 44 | 161 | 44_qynapse_xtra_movr_fitness |
| 46 | 45 | 157 | 45_hackerrank_constant contact_developers_code |
| 47 | 46 | 155 | 46_water_eleclean_cemai_pani |
| 48 | 47 | 154 | 47_annalise_annalise ai_rad_radiology |
| 49 | 48 | 151 | 48_learning_data_machine_machine learning |
| 50 | 49 | 146 | 49_osf_rokit_rokit healthcare_healthcare |
| 51 | 50 | 139 | 50_holographic_holo_hologram_wimi |
| 52 | 51 | 138 | 51_starfilled_starfilled starfilled_course_data |
| 53 | 52 | 138 | 52_renalytix ai_renalytix_shares_rnlx |
| 54 | 53 | 129 | 53_uae_newsgovernment newspeople_eastafricaindian sub_continentchina asia |
| 55 | 54 | 125 | 54_synthesis_synthetic data_synthetic_synthesis ai |
| 56 | 55 | 121 | 55_stroke_brainomix_hungary_stroke patients |
| 57 | 56 | 118 | 56_bfsi_bfsi market_intelligence bfsi_ai bfsi |
| 58 | 57 | 116 | 57_gi_gastroenterology_cdx_agi |
| 59 | 58 | 113 | 58_insurance_insurance market_ai insurance_claims |
| 60 | 59 | 112 | 59_sparkcognition_sgs_generative_generative ai |
| 61 | 60 | 110 | 60_stadium_intelligence stadium_wimbledon_stadium market |
| 62 | 61 | 107 | 61_learning operationalization_operationalization software_operationalization_logicmanager |
| 63 | 62 | 105 | 62_imerit_datastudio_serve api_imerit datastudio |
| 64 | 63 | 105 | 63_relativity_relativityone_data_text iq |
| 65 | 64 | 101 | 64_diabetes_intelligence diabetes_diabetes management_management market |
| 66 | 65 | 96 | 65_skin_skincare_ai skin_cetaphil |
| 67 | 66 | 95 | 66_emotion_emotion recognition_intelligence emotion_recognition market |
| 68 | 67 | 93 | 67_servicebot_workwave_hexaware_datarobot |
| 69 | 68 | 89 | 68_vaidio_olympus_virgo_video |
| 70 | 69 | 82 | 69_vouched_consumer fusion_fusion_identity verification |
| 71 | 70 | 81 | 70_fertility_oma_ivf_alife |
| 72 | 71 | 79 | 71_oil gas_oil_intelligence oil_gas |
| 73 | 72 | 78 | 72_knee_declaire_cowbell_knee replacement |
| 74 | 73 | 78 | 73_gifting_giftpack_getresponse_product recommendations |
| 75 | 74 | 73 | 74_medspa_beautyfix_gleamer_beautyfix medspa |
| 76 | 75 | 70 | 75_intelligence accounting_accounting_accounting market_market |
| 77 | 76 | 70 | 76_octane_octane ai_zero party_party data |
| 78 | 77 | 69 | 77_bigai_bigid_data_pichai |
| 79 | 78 | 68 | 78_pdf_download_book_ebook |
| 80 | 79 | 64 | 79_sermon_franklin_ago_pope |
| 81 | 80 | 57 | 80_biometrics_biometric_recognition_biometrics news |
| 82 | 81 | 52 | 81_wearable_wearable ai_market_ai market |
| 83 | 82 | 51 | 82_citizen_citizen services_services ai_citizen service |
| 84 | 83 | 49 | 83_clarifai_clarifai community_ai resources_community |
| 85 | 84 | 44 | 84_openbench_hemoshear_rare disease_disease target |
| 86 | 85 | 43 | 85_diedrich_rpm_diedrich rpm_clients |
| 87 | 86 | 42 | 86_nft_nfts_trustnft_collateral |
| 88 | 87 | 41 | 87_hum_associations_digital_hum solution |
| 89 | 88 | 41 | 88_railway_railway industry_rail_railways |
| 90 | 89 | 39 | 89_ooda_loopabout ooda_ooda loopabout_loopabout |
| 91 | 90 | 37 | 90_travel_skift_hotel_tourism |
| 92 | 91 | 37 | 91_cardiologs_cardiac_arrhythmias_age groups |
| 93 | 92 | 35 | 92_toggle dropdown_dropdown_toggle_straits times |
| 94 | 93 | 35 | 93_api_traceable_api security_traceable ai |
| 95 | 94 | 34 | 94_weaviate_vector_vector database_ai native |
| 96 | 95 | 33 | 95_sedgwick_sidekick_claims_gpt |
| 97 | 96 | 33 | 96_thrive global_thrive_global_moveworks |
| 98 | 97 | 33 | 97_gridmatrix_port authority_authority_port |
| 99 | 98 | 32 | 98_scaleup_insight partners_insight_partners |
# new_topics_positive_50 = topic_model_positive.reduce_topics(docs, nr_topics=50)
# #topic_mod.reduce_topics(docs, nr_topics = 3)
2023-05-26 05:57:59,732 - BERTopic - Reduced number of topics from 100 to 50
# df_topics_positives_50 = new_topics_positive_50.get_document_info(docs)
# df_topics_positives_50.to_parquet('df_topics_positives_50.parquet', engine='pyarrow')
# freq = new_topics_positive_50.get_topic_info()
# print(f"Topics found: {freq.shape[0]}")
# #freq.head(200)
# #xx topics found
# # Set the display options to show more rows and columns
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# # Print the dataframe
# # freq.head(50)
Topics found: 50
%%time
# Interactive dendrogram of how the reduced topics cluster together.
# NOTE(review): top_n_topics=50 shows only half of the 100 reduced topics.
new_topics_positive_50.visualize_hierarchy(top_n_topics=50)
CPU times: user 1.87 s, sys: 153 ms, total: 2.02 s Wall time: 2.13 s
# Bar charts of the top words for the 10 largest topics.
new_topics_positive_50.visualize_barchart(top_n_topics=10)
# Term-rank plot (word relevance decline per topic) for the same model.
new_topics_positive_50.visualize_term_rank()
#df_topics_positives_100.head(1)
#removing outliers
# Drop BERTopic's outlier bucket (Topic == -1) before any downstream analysis.
df_topics_positives_100 = df_topics_positives_100[df_topics_positives_100['Topic'] != -1]
df_topics_negative = df_topics_positives_100
#because im copying code from the negative_sentiment_analysis
#will change back the variable name later
# NOTE(review): despite the "_negative" name, this frame holds the POSITIVE-
# sentiment topic assignments (see the two comments above).
#add date to the negative topics dataframe
df_topics_negative = pd.merge(df_topics_negative, df_positive[["cleaned_text", "date"]], left_on="Document", right_on="cleaned_text", how="inner")
df_topics_negative.drop("cleaned_text", axis=1, inplace=True)
#add url to the negative topics dataframe
df_topics_negative = pd.merge(df_topics_negative, df_positive[["cleaned_text", "url"]], left_on="Document", right_on="cleaned_text", how="inner")
df_topics_negative.drop("cleaned_text", axis=1, inplace=True)
spacy_content_df = pd.read_parquet('content_clean_spacy_final_project.parquet', engine='pyarrow')
#add spacy text (not lower cased and unclean) to the negative topics dataframe
df_topics_negative = pd.merge(df_topics_negative, spacy_content_df[["content_clean", "url"]], left_on="url", right_on="url", how="inner")
#df_topics_negative.drop("cleaned_text", axis=1, inplace=True)
#add title (cleaned) to the negative topics dataframe
df_topics_negative = pd.merge(df_topics_negative, df_positive[["clean_title", "cleaned_text"]], left_on="Document", right_on="cleaned_text", how="inner")
df_topics_negative.drop("cleaned_text", axis=1, inplace=True)
df_topics_negative.shape
#the merges must have messed up a little bit - shape is different from before adding all the columns
# NOTE(review): inner merges on text/url keys duplicate rows whenever the join
# key is not unique in the right-hand frame — the most likely cause of the row
# count change noted above. Consider drop_duplicates on the key columns before
# merging; verify against the pre-merge shape.
(90374, 10)
# Restore the correct name: the frame holds positive-sentiment topics.
df_topics_positive = df_topics_negative
df_topics_positive.to_parquet('df_topics_positive_100_v3.parquet', engine='pyarrow')
# df_topics_positive = pd.read_parquet('df_topics_positive_100_v3.parquet', engine='pyarrow')
# Topic 2 = ChatGPT. Keep only documents BERTopic assigned with very high
# probability. `.copy()` so later column assignments on this subset do not
# trigger pandas' SettingWithCopyWarning on a slice view.
selected_topics = [2]
chatgpt_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
chatgpt_filtered = chatgpt_df[chatgpt_df['Probability'] > 0.99].copy()
# Count the number of rows satisfying the condition
count = chatgpt_filtered.shape[0]
count
#2495 documents retained as most representative (BERTopic probability method)
2495
import matplotlib.pyplot as plt

def _plot_monthly_doc_counts(frame):
    """Bar chart (white-on-black) of document counts per 'month_year' period."""
    counts = frame.groupby('month_year').size()
    fig = plt.figure(figsize=(12, 6), facecolor='black')
    ax = fig.add_subplot(111)
    ax.patch.set_facecolor('black')
    counts.plot(kind='bar', color='skyblue', ax=ax)
    plt.title('Number of Documents Over Time', color='white')
    plt.xlabel('Month', color='white')
    plt.ylabel('Number of Documents', color='white')
    plt.xticks(rotation=45, color='white')  # rotate labels for readability
    plt.yticks(color='white')
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color('white')
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    ax.grid(False)
    plt.tight_layout()
    plt.show()

# Work on an explicit copy: the frame is a filtered slice, and assigning new
# columns onto it raises pandas' SettingWithCopyWarning.
chatgpt_filtered = chatgpt_filtered.copy()
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
# Month+year period used as the grouping key for the bar chart.
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
_plot_monthly_doc_counts(chatgpt_filtered)
df_analysis_sample = chatgpt_filtered
%%time
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document
def extract_entities(row, label):
doc = nlp(row['content_clean'])
entities = [ent.text for ent in doc.ents if ent.label_ == label]
return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
# Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
# Sort the entities by their counts for each year and keep top N entities
N = 30 # you can change this to keep as many top entities as you like
top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
df_entities_by_label[label] = top_entities_by_year
CPU times: user 1.34 s, sys: 48.1 s, total: 49.4 s Wall time: 2min 23s
# Example: Accessing the top entities for 'ORG'
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']
from wordcloud import WordCloud

def _plot_entity_wordcloud(entity_counts):
    """Word cloud of the 30 entities with the most total mentions.

    Sums mentions across years, keeps the top 30, and renders them. Works on
    a copy so the shared per-label frame is not mutated in place (the
    original appended a 'total' column to it).
    """
    counts = entity_counts.copy()
    counts['total'] = counts.sum(axis=1)
    top30 = counts.sort_values('total', ascending=False).head(30)
    cloud = WordCloud(background_color='white', width=800, height=400,
                      colormap='viridis')
    cloud.generate_from_frequencies(top30['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (the original triplicated this code inline).
_plot_entity_wordcloud(top_org_entities)
_plot_entity_wordcloud(top_person_entities)
_plot_entity_wordcloud(top_product_entities)
# Topic 4 = healthcare. Keep only documents BERTopic assigned with very high
# probability; `.copy()` so later column assignments don't warn on a slice view.
selected_topics = [4]
healthcare_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
healthcare_df_filtered = healthcare_df[healthcare_df['Probability'] > 0.99].copy()
# Count the number of rows satisfying the condition
count = healthcare_df_filtered.shape[0]
count
#1429 documents retained as most representative (BERTopic probability method)
1429
import matplotlib.pyplot as plt

def _plot_monthly_doc_counts(frame):
    """Bar chart (white-on-black) of document counts per 'month_year' period."""
    counts = frame.groupby('month_year').size()
    fig = plt.figure(figsize=(12, 6), facecolor='black')
    ax = fig.add_subplot(111)
    ax.patch.set_facecolor('black')
    counts.plot(kind='bar', color='skyblue', ax=ax)
    plt.title('Number of Documents Over Time', color='white')
    plt.xlabel('Month', color='white')
    plt.ylabel('Number of Documents', color='white')
    plt.xticks(rotation=45, color='white')  # rotate labels for readability
    plt.yticks(color='white')
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color('white')
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    ax.grid(False)
    plt.tight_layout()
    plt.show()

# Explicit copy: assigning columns onto a filtered slice raises
# pandas' SettingWithCopyWarning.
healthcare_df_filtered = healthcare_df_filtered.copy()
healthcare_df_filtered['date'] = pd.to_datetime(healthcare_df_filtered['date'])
healthcare_df_filtered['month_year'] = healthcare_df_filtered['date'].dt.to_period('M')
_plot_monthly_doc_counts(healthcare_df_filtered)
# #EU proposed ban to use of AI
# healthcare_df_filtered_022021 = healthcare_df_filtered[healthcare_df_filtered['month_year'] == '2022-10']
# healthcare_df_filtered_022021.head(2)
df_analysis_sample = healthcare_df_filtered
%%time
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document
def extract_entities(row, label):
doc = nlp(row['content_clean'])
entities = [ent.text for ent in doc.ents if ent.label_ == label]
return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
# Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
# Sort the entities by their counts for each year and keep top N entities
N = 30 # you can change this to keep as many top entities as you like
top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
df_entities_by_label[label] = top_entities_by_year
CPU times: user 459 ms, sys: 49.3 s, total: 49.8 s Wall time: 1min 20s
# Example: Accessing the top entities for 'ORG'
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _plot_entity_wordcloud(entity_counts):
    """Word cloud of the 30 entities with the most total mentions.

    Sums mentions across years, keeps the top 30, and renders them. Works on
    a copy so the shared per-label frame is not mutated in place (the
    original appended a 'total' column to it).
    """
    counts = entity_counts.copy()
    counts['total'] = counts.sum(axis=1)
    top30 = counts.sort_values('total', ascending=False).head(30)
    cloud = WordCloud(background_color='white', width=800, height=400,
                      colormap='viridis')
    cloud.generate_from_frequencies(top30['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (the original triplicated this code inline).
_plot_entity_wordcloud(top_org_entities)
_plot_entity_wordcloud(top_person_entities)
_plot_entity_wordcloud(top_product_entities)
# Topic 7 = automotive / Tesla. Keep only high-probability assignments;
# `.copy()` so later column assignments don't warn on a slice view.
selected_topics = [7]
tesla_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
tesla_df_filtered = tesla_df[tesla_df['Probability'] > 0.99].copy()
# Count the number of rows satisfying the condition
count = tesla_df_filtered.shape[0]
count
#605 documents retained as most representative (BERTopic probability method)
605
import matplotlib.pyplot as plt

def _plot_monthly_doc_counts(frame):
    """Bar chart (white-on-black) of document counts per 'month_year' period."""
    counts = frame.groupby('month_year').size()
    fig = plt.figure(figsize=(12, 6), facecolor='black')
    ax = fig.add_subplot(111)
    ax.patch.set_facecolor('black')
    counts.plot(kind='bar', color='skyblue', ax=ax)
    plt.title('Number of Documents Over Time', color='white')
    plt.xlabel('Month', color='white')
    plt.ylabel('Number of Documents', color='white')
    plt.xticks(rotation=45, color='white')  # rotate labels for readability
    plt.yticks(color='white')
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color('white')
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    ax.grid(False)
    plt.tight_layout()
    plt.show()

# Explicit copy: assigning columns onto a filtered slice raises
# pandas' SettingWithCopyWarning.
tesla_df_filtered = tesla_df_filtered.copy()
tesla_df_filtered['date'] = pd.to_datetime(tesla_df_filtered['date'])
tesla_df_filtered['month_year'] = tesla_df_filtered['date'].dt.to_period('M')
_plot_monthly_doc_counts(tesla_df_filtered)
df_analysis_sample = tesla_df_filtered
%%time
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document
def extract_entities(row, label):
doc = nlp(row['content_clean'])
entities = [ent.text for ent in doc.ents if ent.label_ == label]
return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
# Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
# Sort the entities by their counts for each year and keep top N entities
N = 30 # you can change this to keep as many top entities as you like
top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
df_entities_by_label[label] = top_entities_by_year
CPU times: user 238 ms, sys: 49.2 s, total: 49.4 s Wall time: 59.6 s
# Example: Accessing the top entities for 'ORG'
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _plot_entity_wordcloud(entity_counts):
    """Word cloud of the 30 entities with the most total mentions.

    Sums mentions across years, keeps the top 30, and renders them. Works on
    a copy so the shared per-label frame is not mutated in place (the
    original appended a 'total' column to it).
    """
    counts = entity_counts.copy()
    counts['total'] = counts.sum(axis=1)
    top30 = counts.sort_values('total', ascending=False).head(30)
    cloud = WordCloud(background_color='white', width=800, height=400,
                      colormap='viridis')
    cloud.generate_from_frequencies(top30['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (the original triplicated this code inline).
_plot_entity_wordcloud(top_org_entities)
_plot_entity_wordcloud(top_person_entities)
_plot_entity_wordcloud(top_product_entities)
# Topic 9 = military / defense / aviation intelligence.
selected_topics = [9]
military_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
military_df.shape
(1160, 10)
# Lower probability cutoff here than for larger topics: topic 9 has only
# 1160 documents, so 0.5 keeps a usable sample. `.copy()` so later column
# assignments don't trigger SettingWithCopyWarning on a slice view.
military_df_filtered = military_df[military_df['Probability'] > 0.5].copy()
# Count the number of rows satisfying the condition
count = military_df_filtered.shape[0]
count
#952 documents retained as most representative (BERTopic probability method)
952
# Reuse the cell's working variable name; `.copy()` so the column assignments
# below don't trigger SettingWithCopyWarning on the filtered slice.
chatgpt_filtered = military_df_filtered.copy()
import matplotlib.pyplot as plt

def _plot_monthly_doc_counts(frame):
    """Bar chart (white-on-black) of document counts per 'month_year' period."""
    counts = frame.groupby('month_year').size()
    fig = plt.figure(figsize=(12, 6), facecolor='black')
    ax = fig.add_subplot(111)
    ax.patch.set_facecolor('black')
    counts.plot(kind='bar', color='skyblue', ax=ax)
    plt.title('Number of Documents Over Time', color='white')
    plt.xlabel('Month', color='white')
    plt.ylabel('Number of Documents', color='white')
    plt.xticks(rotation=45, color='white')  # rotate labels for readability
    plt.yticks(color='white')
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color('white')
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    ax.grid(False)
    plt.tight_layout()
    plt.show()

chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
_plot_monthly_doc_counts(chatgpt_filtered)
# Use the working copy (same content the original alias carried, including
# the 'date'/'month_year' columns added above).
df_analysis_sample = chatgpt_filtered
%%time
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document
def extract_entities(row, label):
doc = nlp(row['content_clean'])
entities = [ent.text for ent in doc.ents if ent.label_ == label]
return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
# Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
# Sort the entities by their counts for each year and keep top N entities
N = 30 # you can change this to keep as many top entities as you like
top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
df_entities_by_label[label] = top_entities_by_year
CPU times: user 430 ms, sys: 47.9 s, total: 48.3 s Wall time: 1min 6s
# Example: Accessing the top entities for 'ORG'
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _plot_entity_wordcloud(entity_counts):
    """Word cloud of the 30 entities with the most total mentions.

    Sums mentions across years, keeps the top 30, and renders them. Works on
    a copy so the shared per-label frame is not mutated in place (the
    original appended a 'total' column to it).
    """
    counts = entity_counts.copy()
    counts['total'] = counts.sum(axis=1)
    top30 = counts.sort_values('total', ascending=False).head(30)
    cloud = WordCloud(background_color='white', width=800, height=400,
                      colormap='viridis')
    cloud.generate_from_frequencies(top30['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (the original triplicated this code inline).
_plot_entity_wordcloud(top_org_entities)
_plot_entity_wordcloud(top_person_entities)
_plot_entity_wordcloud(top_product_entities)
# Topic 11 = education intelligence.
selected_topics = [11]
edu_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
edu_df.shape
(999, 10)
# Medium cutoff: topic 11 has ~1000 documents, so 0.7 keeps a usable sample.
# `.copy()` so later column assignments don't warn on a slice view.
edu_df_filtered = edu_df[edu_df['Probability'] > 0.7].copy()
# Count the number of rows satisfying the condition
count = edu_df_filtered.shape[0]
count
#673 documents retained as most representative (BERTopic probability method)
673
# Reuse the cell's working variable name; `.copy()` so the column assignments
# below don't trigger SettingWithCopyWarning on the filtered slice.
chatgpt_filtered = edu_df_filtered.copy()
import matplotlib.pyplot as plt

def _plot_monthly_doc_counts(frame):
    """Bar chart (white-on-black) of document counts per 'month_year' period."""
    counts = frame.groupby('month_year').size()
    fig = plt.figure(figsize=(12, 6), facecolor='black')
    ax = fig.add_subplot(111)
    ax.patch.set_facecolor('black')
    counts.plot(kind='bar', color='skyblue', ax=ax)
    plt.title('Number of Documents Over Time', color='white')
    plt.xlabel('Month', color='white')
    plt.ylabel('Number of Documents', color='white')
    plt.xticks(rotation=45, color='white')  # rotate labels for readability
    plt.yticks(color='white')
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color('white')
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    ax.grid(False)
    plt.tight_layout()
    plt.show()

chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
_plot_monthly_doc_counts(chatgpt_filtered)
df_analysis_sample = chatgpt_filtered
%%time
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document
def extract_entities(row, label):
doc = nlp(row['content_clean'])
entities = [ent.text for ent in doc.ents if ent.label_ == label]
return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
# Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
# Sort the entities by their counts for each year and keep top N entities
N = 30 # you can change this to keep as many top entities as you like
top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
df_entities_by_label[label] = top_entities_by_year
CPU times: user 296 ms, sys: 49.2 s, total: 49.5 s Wall time: 58.6 s
# Per-label top-entity tables produced by the NER cell.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _show_entity_wordcloud(entity_df, top_n=30):
    """Draw a word cloud of the `top_n` most-mentioned entities.

    Adds a 'total' column (mentions summed across the year columns) to
    `entity_df` in place, keeps the `top_n` rows with the highest totals,
    and renders a word cloud weighted by those totals.
    """
    entity_df['total'] = entity_df.sum(axis=1)
    top = entity_df.sort_values('total', ascending=False).head(top_n)
    wc = WordCloud(background_color='white', width=800, height=400, colormap='viridis')
    wc.generate_from_frequencies(top['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (replaces three copy-pasted cells).
_show_entity_wordcloud(top_org_entities)
_show_entity_wordcloud(top_person_entities)
_show_entity_wordcloud(top_product_entities)
selected_topics = [15]
# Negative-sentiment documents assigned to the selected topic.
cyber_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
chatgpt_filtered = cyber_df
import matplotlib.pyplot as plt

# Normalise timestamps, then bucket the documents by calendar month.
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
documents_by_month = chatgpt_filtered.groupby('month_year').size()

# Dark-themed bar chart of monthly document volume.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Documents Over Time', color='white')
ax.set_xlabel('Month', color='white')
ax.set_ylabel('Number of Documents', color='white')
# White spines and ticks so the axes stay visible on the black background.
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.tick_params(axis='x', colors='white', labelrotation=45)
ax.tick_params(axis='y', colors='white')
ax.grid(False)
fig.tight_layout()
plt.show()
df_analysis_sample = chatgpt_filtered
%%time
# Re-coerce dates (idempotent if the column is already datetime).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document.
# NOTE(review): `nlp` is presumably a spaCy pipeline loaded elsewhere in the
# notebook — confirm. Each call re-parses the full text, so the loop below
# parses every document once per label (3x in total).
def extract_entities(row, label):
    doc = nlp(row['content_clean'])
    entities = [ent.text for ent in doc.ents if ent.label_ == label]
    return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
    # Parallel NER across rows via pandarallel's parallel_apply.
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
    # Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
    entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
    # Sort the entities by their counts for each year and keep top N entities
    N = 30 # you can change this to keep as many top entities as you like
    top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 451 ms, sys: 47.7 s, total: 48.1 s Wall time: 1min 13s
# Per-label top-entity tables produced by the NER cell.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _show_entity_wordcloud(entity_df, top_n=30):
    """Draw a word cloud of the `top_n` most-mentioned entities.

    Adds a 'total' column (mentions summed across the year columns) to
    `entity_df` in place, keeps the `top_n` rows with the highest totals,
    and renders a word cloud weighted by those totals.
    """
    entity_df['total'] = entity_df.sum(axis=1)
    top = entity_df.sort_values('total', ascending=False).head(top_n)
    wc = WordCloud(background_color='white', width=800, height=400, colormap='viridis')
    wc.generate_from_frequencies(top['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (replaces three copy-pasted cells).
_show_entity_wordcloud(top_org_entities)
_show_entity_wordcloud(top_person_entities)
_show_entity_wordcloud(top_product_entities)
selected_topics = [21]
# Negative-sentiment documents assigned to the selected topic.
fashion_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
chatgpt_filtered = fashion_df
import matplotlib.pyplot as plt

# Normalise timestamps, then bucket the documents by calendar month.
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
documents_by_month = chatgpt_filtered.groupby('month_year').size()

# Dark-themed bar chart of monthly document volume.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Documents Over Time', color='white')
ax.set_xlabel('Month', color='white')
ax.set_ylabel('Number of Documents', color='white')
# White spines and ticks so the axes stay visible on the black background.
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.tick_params(axis='x', colors='white', labelrotation=45)
ax.tick_params(axis='y', colors='white')
ax.grid(False)
fig.tight_layout()
plt.show()
df_analysis_sample = chatgpt_filtered
%%time
# Re-coerce dates (idempotent if the column is already datetime).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document.
# NOTE(review): `nlp` is presumably a spaCy pipeline loaded elsewhere in the
# notebook — confirm. Each call re-parses the full text, so the loop below
# parses every document once per label (3x in total).
def extract_entities(row, label):
    doc = nlp(row['content_clean'])
    entities = [ent.text for ent in doc.ents if ent.label_ == label]
    return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
    # Parallel NER across rows via pandarallel's parallel_apply.
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
    # Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
    entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
    # Sort the entities by their counts for each year and keep top N entities
    N = 30 # you can change this to keep as many top entities as you like
    top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 323 ms, sys: 46.6 s, total: 46.9 s Wall time: 1min 11s
# Per-label top-entity tables produced by the NER cell.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _show_entity_wordcloud(entity_df, top_n=30):
    """Draw a word cloud of the `top_n` most-mentioned entities.

    Adds a 'total' column (mentions summed across the year columns) to
    `entity_df` in place, keeps the `top_n` rows with the highest totals,
    and renders a word cloud weighted by those totals.
    """
    entity_df['total'] = entity_df.sum(axis=1)
    top = entity_df.sort_values('total', ascending=False).head(top_n)
    wc = WordCloud(background_color='white', width=800, height=400, colormap='viridis')
    wc.generate_from_frequencies(top['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (replaces three copy-pasted cells).
_show_entity_wordcloud(top_org_entities)
_show_entity_wordcloud(top_person_entities)
_show_entity_wordcloud(top_product_entities)
selected_topics = [28]
# Negative-sentiment documents assigned to the selected topic.
ag_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
chatgpt_filtered = ag_df
import matplotlib.pyplot as plt

# Normalise timestamps, then bucket the documents by calendar month.
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
documents_by_month = chatgpt_filtered.groupby('month_year').size()

# Dark-themed bar chart of monthly document volume.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Documents Over Time', color='white')
ax.set_xlabel('Month', color='white')
ax.set_ylabel('Number of Documents', color='white')
# White spines and ticks so the axes stay visible on the black background.
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.tick_params(axis='x', colors='white', labelrotation=45)
ax.tick_params(axis='y', colors='white')
ax.grid(False)
fig.tight_layout()
plt.show()
df_analysis_sample = chatgpt_filtered
%%time
# Re-coerce dates (idempotent if the column is already datetime).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document.
# NOTE(review): `nlp` is presumably a spaCy pipeline loaded elsewhere in the
# notebook — confirm. Each call re-parses the full text, so the loop below
# parses every document once per label (3x in total).
def extract_entities(row, label):
    doc = nlp(row['content_clean'])
    entities = [ent.text for ent in doc.ents if ent.label_ == label]
    return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
    # Parallel NER across rows via pandarallel's parallel_apply.
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
    # Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
    entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
    # Sort the entities by their counts for each year and keep top N entities
    N = 30 # you can change this to keep as many top entities as you like
    top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 297 ms, sys: 46.1 s, total: 46.4 s Wall time: 51.5 s
# Per-label top-entity tables produced by the NER cell.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _show_entity_wordcloud(entity_df, top_n=30):
    """Draw a word cloud of the `top_n` most-mentioned entities.

    Adds a 'total' column (mentions summed across the year columns) to
    `entity_df` in place, keeps the `top_n` rows with the highest totals,
    and renders a word cloud weighted by those totals.
    """
    entity_df['total'] = entity_df.sum(axis=1)
    top = entity_df.sort_values('total', ascending=False).head(top_n)
    wc = WordCloud(background_color='white', width=800, height=400, colormap='viridis')
    wc.generate_from_frequencies(top['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (replaces three copy-pasted cells).
_show_entity_wordcloud(top_org_entities)
_show_entity_wordcloud(top_person_entities)
_show_entity_wordcloud(top_product_entities)
selected_topics = [31]
# Negative-sentiment documents assigned to the selected topic.
energy_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
chatgpt_filtered = energy_df
import matplotlib.pyplot as plt

# Normalise timestamps, then bucket the documents by calendar month.
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
documents_by_month = chatgpt_filtered.groupby('month_year').size()

# Dark-themed bar chart of monthly document volume.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Documents Over Time', color='white')
ax.set_xlabel('Month', color='white')
ax.set_ylabel('Number of Documents', color='white')
# White spines and ticks so the axes stay visible on the black background.
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.tick_params(axis='x', colors='white', labelrotation=45)
ax.tick_params(axis='y', colors='white')
ax.grid(False)
fig.tight_layout()
plt.show()
df_analysis_sample = chatgpt_filtered
%%time
# Re-coerce dates (idempotent if the column is already datetime).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document.
# NOTE(review): `nlp` is presumably a spaCy pipeline loaded elsewhere in the
# notebook — confirm. Each call re-parses the full text, so the loop below
# parses every document once per label (3x in total).
def extract_entities(row, label):
    doc = nlp(row['content_clean'])
    entities = [ent.text for ent in doc.ents if ent.label_ == label]
    return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
    # Parallel NER across rows via pandarallel's parallel_apply.
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
    # Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
    entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
    # Sort the entities by their counts for each year and keep top N entities
    N = 30 # you can change this to keep as many top entities as you like
    top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 204 ms, sys: 45.5 s, total: 45.7 s Wall time: 49.9 s
# Per-label top-entity tables produced by the NER cell.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _show_entity_wordcloud(entity_df, top_n=30):
    """Draw a word cloud of the `top_n` most-mentioned entities.

    Adds a 'total' column (mentions summed across the year columns) to
    `entity_df` in place, keeps the `top_n` rows with the highest totals,
    and renders a word cloud weighted by those totals.
    """
    entity_df['total'] = entity_df.sum(axis=1)
    top = entity_df.sort_values('total', ascending=False).head(top_n)
    wc = WordCloud(background_color='white', width=800, height=400, colormap='viridis')
    wc.generate_from_frequencies(top['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (replaces three copy-pasted cells).
_show_entity_wordcloud(top_org_entities)
_show_entity_wordcloud(top_person_entities)
_show_entity_wordcloud(top_product_entities)
selected_topics = [33]
# Negative-sentiment documents assigned to the selected topic.
legal_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
chatgpt_filtered = legal_df
import matplotlib.pyplot as plt

# Normalise timestamps, then bucket the documents by calendar month.
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
documents_by_month = chatgpt_filtered.groupby('month_year').size()

# Dark-themed bar chart of monthly document volume.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Documents Over Time', color='white')
ax.set_xlabel('Month', color='white')
ax.set_ylabel('Number of Documents', color='white')
# White spines and ticks so the axes stay visible on the black background.
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.tick_params(axis='x', colors='white', labelrotation=45)
ax.tick_params(axis='y', colors='white')
ax.grid(False)
fig.tight_layout()
plt.show()
df_analysis_sample = chatgpt_filtered
%%time
# Re-coerce dates (idempotent if the column is already datetime).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document.
# NOTE(review): `nlp` is presumably a spaCy pipeline loaded elsewhere in the
# notebook — confirm. Each call re-parses the full text, so the loop below
# parses every document once per label (3x in total).
def extract_entities(row, label):
    doc = nlp(row['content_clean'])
    entities = [ent.text for ent in doc.ents if ent.label_ == label]
    return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
    # Parallel NER across rows via pandarallel's parallel_apply.
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
    # Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
    entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
    # Sort the entities by their counts for each year and keep top N entities
    N = 30 # you can change this to keep as many top entities as you like
    top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 274 ms, sys: 46.6 s, total: 46.9 s Wall time: 53.4 s
# Per-label top-entity tables produced by the NER cell.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _show_entity_wordcloud(entity_df, top_n=30):
    """Draw a word cloud of the `top_n` most-mentioned entities.

    Adds a 'total' column (mentions summed across the year columns) to
    `entity_df` in place, keeps the `top_n` rows with the highest totals,
    and renders a word cloud weighted by those totals.
    """
    entity_df['total'] = entity_df.sum(axis=1)
    top = entity_df.sort_values('total', ascending=False).head(top_n)
    wc = WordCloud(background_color='white', width=800, height=400, colormap='viridis')
    wc.generate_from_frequencies(top['total'])
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity type (replaces three copy-pasted cells).
_show_entity_wordcloud(top_org_entities)
_show_entity_wordcloud(top_person_entities)
_show_entity_wordcloud(top_product_entities)
selected_topics = [38]
# Negative-sentiment documents assigned to the selected topic.
convai_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)]
chatgpt_filtered = convai_df
import matplotlib.pyplot as plt

# Normalise timestamps, then bucket the documents by calendar month.
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
documents_by_month = chatgpt_filtered.groupby('month_year').size()

# Dark-themed bar chart of monthly document volume.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Documents Over Time', color='white')
ax.set_xlabel('Month', color='white')
ax.set_ylabel('Number of Documents', color='white')
# White spines and ticks so the axes stay visible on the black background.
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.tick_params(axis='x', colors='white', labelrotation=45)
ax.tick_params(axis='y', colors='white')
ax.grid(False)
fig.tight_layout()
plt.show()
df_analysis_sample = chatgpt_filtered
%%time
# Re-coerce dates (idempotent if the column is already datetime).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])
# Define the entity labels of interest
entity_labels = ['ORG', 'PRODUCT', 'PERSON']
# Function to extract entities by label from a single document.
# NOTE(review): `nlp` is presumably a spaCy pipeline loaded elsewhere in the
# notebook — confirm. Each call re-parses the full text, so the loop below
# parses every document once per label (3x in total).
def extract_entities(row, label):
    doc = nlp(row['content_clean'])
    entities = [ent.text for ent in doc.ents if ent.label_ == label]
    return entities
# Add year column to dataframe for future use
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year
# Dictionary to store top entities DataFrames by label
df_entities_by_label = {}
# Iterate over each entity label
for label in entity_labels:
    # Parallel NER across rows via pandarallel's parallel_apply.
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(extract_entities, args=(label,), axis=1)
    # Generate a dataframe where rows correspond to entities, columns correspond to years, and cells contain counts
    entities_by_year = df_analysis_sample.groupby('year')[label].sum().apply(pd.Series.value_counts).unstack().fillna(0)
    # Sort the entities by their counts for each year and keep top N entities
    N = 30 # you can change this to keep as many top entities as you like
    top_entities_by_year = entities_by_year.apply(lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 198 ms, sys: 44.8 s, total: 45 s Wall time: 51.6 s
# Pull the per-label top-entity tables computed above.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _plot_entity_wordcloud(entity_df, top_n=30):
    """Show a word cloud of the top_n entities by total mentions across years.

    Computes the totals as a standalone Series instead of writing a 'total'
    column into entity_df: the previous in-place version mutated the frames
    stored in df_entities_by_label, and on a re-run the 'total' column was
    itself included in the sum, doubling every count.
    """
    totals = entity_df.sum(axis=1).nlargest(top_n)
    wc = WordCloud(background_color='white', width=800, height=400,
                   colormap='viridis')
    wc.generate_from_frequencies(totals)
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity label.
_plot_entity_wordcloud(top_org_entities)
_plot_entity_wordcloud(top_person_entities)
_plot_entity_wordcloud(top_product_entities)
# Topic 41 = quantum computing (negative-sentiment documents).
selected_topics = [41]
# .copy() so the column assignments below act on an independent frame instead
# of a view of df_topics_negative (avoids SettingWithCopyWarning and the risk
# of the assignments silently not sticking).
quantum_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)].copy()
chatgpt_filtered = quantum_df
import matplotlib.pyplot as plt
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
# Bucket each document into its calendar month.
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
# Document volume per month.
documents_by_month = chatgpt_filtered.groupby('month_year').size()
# Dark-themed bar chart of monthly document counts.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
plt.title('Number of Documents Over Time', color='white')
plt.xlabel('Month', color='white')
plt.ylabel('Number of Documents', color='white')
plt.xticks(rotation=45, color='white')  # rotate for readability
plt.yticks(color='white')
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.xaxis.label.set_color('white')
ax.yaxis.label.set_color('white')
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')
ax.grid(False)  # no gridlines on the dark background
plt.tight_layout()
plt.show()
# Downstream NER cells operate on this alias.
df_analysis_sample = chatgpt_filtered
%%time
# Normalise the date column to datetimes (no-op if already parsed upstream).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])

# spaCy entity labels of interest.
entity_labels = ['ORG', 'PRODUCT', 'PERSON']

def extract_entities(row, label):
    """Return the texts of entities tagged `label` in row['content_clean'].

    Relies on a module-level spaCy pipeline `nlp` (loaded elsewhere in the
    notebook) — TODO confirm the pipeline/model used.
    """
    doc = nlp(row['content_clean'])
    return [ent.text for ent in doc.ents if ent.label_ == label]

# Year column used as the grouping key below.
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year

# Top-entity tables keyed by entity label.
df_entities_by_label = {}
for label in entity_labels:
    # One list-of-entity-strings column per label (pandarallel parallel_apply).
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(
        extract_entities, args=(label,), axis=1)
    # groupby(...).sum() concatenates the per-row lists for each year;
    # value_counts then turns each year's concatenated list into entity counts.
    entities_by_year = (
        df_analysis_sample.groupby('year')[label]
        .sum()
        .apply(pd.Series.value_counts)
        .unstack()
        .fillna(0)
    )
    # Keep the N most-mentioned entities per year.
    N = 30  # change to keep more/fewer top entities
    top_entities_by_year = entities_by_year.apply(
        lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x
    ).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 215 ms, sys: 47.2 s, total: 47.5 s Wall time: 44.3 s
# Pull the per-label top-entity tables computed above.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _plot_entity_wordcloud(entity_df, top_n=30):
    """Show a word cloud of the top_n entities by total mentions across years.

    Computes the totals as a standalone Series instead of writing a 'total'
    column into entity_df: the previous in-place version mutated the frames
    stored in df_entities_by_label, and on a re-run the 'total' column was
    itself included in the sum, doubling every count.
    """
    totals = entity_df.sum(axis=1).nlargest(top_n)
    wc = WordCloud(background_color='white', width=800, height=400,
                   colormap='viridis')
    wc.generate_from_frequencies(totals)
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity label.
_plot_entity_wordcloud(top_org_entities)
_plot_entity_wordcloud(top_person_entities)
_plot_entity_wordcloud(top_product_entities)
# Topic 58 = insurance (negative-sentiment documents).
selected_topics = [58]
# .copy() so the column assignments below act on an independent frame instead
# of a view of df_topics_negative (avoids SettingWithCopyWarning and the risk
# of the assignments silently not sticking).
insurance_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)].copy()
chatgpt_filtered = insurance_df
import matplotlib.pyplot as plt
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
# Bucket each document into its calendar month.
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
# Document volume per month.
documents_by_month = chatgpt_filtered.groupby('month_year').size()
# Dark-themed bar chart of monthly document counts.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
plt.title('Number of Documents Over Time', color='white')
plt.xlabel('Month', color='white')
plt.ylabel('Number of Documents', color='white')
plt.xticks(rotation=45, color='white')  # rotate for readability
plt.yticks(color='white')
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.xaxis.label.set_color('white')
ax.yaxis.label.set_color('white')
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')
ax.grid(False)  # no gridlines on the dark background
plt.tight_layout()
plt.show()
# Downstream NER cells operate on this alias.
df_analysis_sample = chatgpt_filtered
%%time
# Normalise the date column to datetimes (no-op if already parsed upstream).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])

# spaCy entity labels of interest.
entity_labels = ['ORG', 'PRODUCT', 'PERSON']

def extract_entities(row, label):
    """Return the texts of entities tagged `label` in row['content_clean'].

    Relies on a module-level spaCy pipeline `nlp` (loaded elsewhere in the
    notebook) — TODO confirm the pipeline/model used.
    """
    doc = nlp(row['content_clean'])
    return [ent.text for ent in doc.ents if ent.label_ == label]

# Year column used as the grouping key below.
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year

# Top-entity tables keyed by entity label.
df_entities_by_label = {}
for label in entity_labels:
    # One list-of-entity-strings column per label (pandarallel parallel_apply).
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(
        extract_entities, args=(label,), axis=1)
    # groupby(...).sum() concatenates the per-row lists for each year;
    # value_counts then turns each year's concatenated list into entity counts.
    entities_by_year = (
        df_analysis_sample.groupby('year')[label]
        .sum()
        .apply(pd.Series.value_counts)
        .unstack()
        .fillna(0)
    )
    # Keep the N most-mentioned entities per year.
    N = 30  # change to keep more/fewer top entities
    top_entities_by_year = entities_by_year.apply(
        lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x
    ).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 169 ms, sys: 46.4 s, total: 46.6 s Wall time: 42.9 s
# Pull the per-label top-entity tables computed above.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _plot_entity_wordcloud(entity_df, top_n=30):
    """Show a word cloud of the top_n entities by total mentions across years.

    Computes the totals as a standalone Series instead of writing a 'total'
    column into entity_df: the previous in-place version mutated the frames
    stored in df_entities_by_label, and on a re-run the 'total' column was
    itself included in the sum, doubling every count.
    """
    totals = entity_df.sum(axis=1).nlargest(top_n)
    wc = WordCloud(background_color='white', width=800, height=400,
                   colormap='viridis')
    wc.generate_from_frequencies(totals)
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity label.
_plot_entity_wordcloud(top_org_entities)
_plot_entity_wordcloud(top_person_entities)
_plot_entity_wordcloud(top_product_entities)
# Topic 71 = oil (negative-sentiment documents).
selected_topics = [71]
# .copy() so the column assignments below act on an independent frame instead
# of a view of df_topics_negative (avoids SettingWithCopyWarning and the risk
# of the assignments silently not sticking).
oil_df = df_topics_negative[df_topics_negative['Topic'].isin(selected_topics)].copy()
chatgpt_filtered = oil_df
import matplotlib.pyplot as plt
chatgpt_filtered['date'] = pd.to_datetime(chatgpt_filtered['date'])
# Bucket each document into its calendar month.
chatgpt_filtered['month_year'] = chatgpt_filtered['date'].dt.to_period('M')
# Document volume per month.
documents_by_month = chatgpt_filtered.groupby('month_year').size()
# Dark-themed bar chart of monthly document counts.
fig = plt.figure(figsize=(12, 6), facecolor='black')
ax = fig.add_subplot(111)
ax.patch.set_facecolor('black')
documents_by_month.plot(kind='bar', color='skyblue', ax=ax)
plt.title('Number of Documents Over Time', color='white')
plt.xlabel('Month', color='white')
plt.ylabel('Number of Documents', color='white')
plt.xticks(rotation=45, color='white')  # rotate for readability
plt.yticks(color='white')
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('white')
ax.xaxis.label.set_color('white')
ax.yaxis.label.set_color('white')
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')
ax.grid(False)  # no gridlines on the dark background
plt.tight_layout()
plt.show()
# Downstream NER cells operate on this alias.
df_analysis_sample = chatgpt_filtered
%%time
# Normalise the date column to datetimes (no-op if already parsed upstream).
df_analysis_sample['date'] = pd.to_datetime(df_analysis_sample['date'])

# spaCy entity labels of interest.
entity_labels = ['ORG', 'PRODUCT', 'PERSON']

def extract_entities(row, label):
    """Return the texts of entities tagged `label` in row['content_clean'].

    Relies on a module-level spaCy pipeline `nlp` (loaded elsewhere in the
    notebook) — TODO confirm the pipeline/model used.
    """
    doc = nlp(row['content_clean'])
    return [ent.text for ent in doc.ents if ent.label_ == label]

# Year column used as the grouping key below.
df_analysis_sample['year'] = df_analysis_sample['date'].dt.year

# Top-entity tables keyed by entity label.
df_entities_by_label = {}
for label in entity_labels:
    # One list-of-entity-strings column per label (pandarallel parallel_apply).
    df_analysis_sample[label] = df_analysis_sample.parallel_apply(
        extract_entities, args=(label,), axis=1)
    # groupby(...).sum() concatenates the per-row lists for each year;
    # value_counts then turns each year's concatenated list into entity counts.
    entities_by_year = (
        df_analysis_sample.groupby('year')[label]
        .sum()
        .apply(pd.Series.value_counts)
        .unstack()
        .fillna(0)
    )
    # Keep the N most-mentioned entities per year.
    N = 30  # change to keep more/fewer top entities
    top_entities_by_year = entities_by_year.apply(
        lambda x: x.nlargest(N) if isinstance(x, pd.Series) else x
    ).unstack()
    df_entities_by_label[label] = top_entities_by_year
CPU times: user 190 ms, sys: 46.1 s, total: 46.3 s Wall time: 42.7 s
# Pull the per-label top-entity tables computed above.
top_org_entities = df_entities_by_label['ORG']
top_person_entities = df_entities_by_label['PERSON']
top_product_entities = df_entities_by_label['PRODUCT']

def _plot_entity_wordcloud(entity_df, top_n=30):
    """Show a word cloud of the top_n entities by total mentions across years.

    Computes the totals as a standalone Series instead of writing a 'total'
    column into entity_df: the previous in-place version mutated the frames
    stored in df_entities_by_label, and on a re-run the 'total' column was
    itself included in the sum, doubling every count.
    """
    totals = entity_df.sum(axis=1).nlargest(top_n)
    wc = WordCloud(background_color='white', width=800, height=400,
                   colormap='viridis')
    wc.generate_from_frequencies(totals)
    plt.figure(figsize=(8, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title('Word Cloud')
    plt.axis('off')
    plt.show()

# One cloud per entity label.
_plot_entity_wordcloud(top_org_entities)
_plot_entity_wordcloud(top_person_entities)
_plot_entity_wordcloud(top_product_entities)
#topic 2 - chatgpt
#topic 4 - healthcare (topic 19 - drug discovery)(topic 24 - covid coronavirus)
#topic 5 - computer vision
#topic 7 - automotive / tesla
#topic 8 - nvidia
#topic 9 - military intelligence aviation defense
#topic 11 - intelligence education
#topic 15 - cybersecurity
#topic 21 - retail fashion
#topic 22 - iot
#topic 28 - agriculture
#topic 31 - energy intelligence
#topic 32 - food intelligence ai
#topic 33 - legal tech
#topic 34 - dentists
#topic 38 - conversational ai
#topic 41 - quantum computing
#topic 58 - insurance
#topic 71 - oil
# %pip install accelerate==0.18.0
# %pip install transformers==4.25.1
# %pip install datasets==2.1.0
# %pip install sentencepiece==0.1.97
# %pip install ipywidgets==8.0.4
import torch

# Report the CUDA environment that PyTorch can see.
print("CUDA is available:", torch.cuda.is_available())
print("Number of CUDA devices:", torch.cuda.device_count())
print("CUDA version used by PyTorch:", torch.version.cuda)
for i in range(torch.cuda.device_count()):
    # Name of each visible GPU (indentation restored — the notebook export
    # had flattened the loop body to column 0).
    print(torch.cuda.get_device_name(i))
CUDA is available: True Number of CUDA devices: 2 CUDA version used by PyTorch: 11.6 Tesla T4 Tesla T4
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
def generate(prompt, max_new_tokens=100):
    """Generate text for `prompt` with the module-level `model`/`tokenizer`.

    Parameters:
        prompt: input text fed to the seq2seq model.
        max_new_tokens: cap on the number of generated tokens.

    Returns the decoded generation(s) joined with newlines. Body indentation
    restored — the notebook export had flattened it to column 0.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return "\n".join(result)
# One of the published FLAN-T5 checkpoints (base size).
checkpoint = "google/flan-t5-base"
# Load the tokenizer and seq2seq model for this checkpoint; the tokenizer
# preprocesses text into the tensor inputs the model expects.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
Downloading (…)lve/main/config.json: 0%| | 0.00/1.40k [00:00<?, ?B/s]
Downloading pytorch_model.bin: 0%| | 0.00/990M [00:00<?, ?B/s]
Downloading (…)neration_config.json: 0%| | 0.00/147 [00:00<?, ?B/s]
%%time
# Demo: summarize a news excerpt with the FLAN-T5 model loaded above.
# NOTE(review): the model runs on CPU here (no .to('cuda')), which is why the
# call is slow — confirm whether GPU inference was intended.
prompt = "summarize the text: Available on Google Cloud\'s Analytics Hub and powered by BigQuery, the AHS provides a comprehensive and up-to-date view of human settlements around the world, delivering precise information on the evolving footprint and density of human activity through the application of artificial intelligence (AI) to satellite imagery.Analysis of the built environment assets affected by a natural disaster can help guide search and rescue efforts, improve distribution of humanitarian aid, and prioritize mid- and long term reconstruction and investment planning said Abe Tarapani, CEO of Atlas AI. We're proud to make the Atlas of Human Settlements dataset covering Turkey and Syria freely available to government and humanitarian agencies supporting the response to this tragic crisis, and we are committed to continuing to enhance this product in the days and weeks to come as needs emerge.With the outdated information offered by the best alternative source, the Global Human Settlement Layer (GHSL) from 2018, the AHS layers offer the most up-to-date pre-crisis data from 2021, are of superior quality, and are delivered with annual updates at a spatial resolution of 10 meters. The package includes three data products, including a built-up surface map, built-up index map, and settlement map. The AHS product is explicitly designed to help response agencies answer questions such as"
# Generate a summary (default max_new_tokens=100).
response = generate(prompt)
response
CPU times: user 16min 28s, sys: 0 ns, total: 16min 28s Wall time: 4min 43s
'The Atlas of Human Settlements (AHS) is a new tool for assessing the impact of natural disasters on human settlements.'